CASD: Converting the SAS extract to Apache Parquet

# Display every expression result in a cell, not just the last one.
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
year = "2020"
# year = "2018"
SAS_FILE = (
    r"C:\Users\Public\Documents\TRAVAIL\agregats\sas/"
    #+ "extrait_dads_2020.sas7bdat"
    + "extrait_dads_2020_220809.sas7bdat"
)

OUT_PATH = r"C:\Users\Public\Documents\TRAVAIL\agregats\data\chunks\"
OUT_PATH = OUT_PATH + "extrait_dads_" + year + r"-chunk/"
taille_chunk = 2 * 2**20  # ~2.1 million rows per chunk
import shutil
from pathlib import Path

import pandas as pd
import vaex
from tqdm import tqdm

def clean_chunk(chunk):
    """Lowercase the column names and fill missing numeric values with 0."""
    chunk.columns = [c.lower() for c in chunk.columns]
    for col in "nb_postes_du_nir nbheur eqtp duree eff_3112 s_brut pepa net".split():
        chunk[col] = chunk[col].fillna(0)
    return chunk
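
As a quick sanity check, clean_chunk can be exercised on a toy frame (hypothetical values, not DADS data):

# Toy frame with the eight numeric columns, one NaN in each.
toy = pd.DataFrame(
    {c.upper(): [1.0, float("nan")]
     for c in "nb_postes_du_nir nbheur eqtp duree eff_3112 s_brut pepa net".split()}
)
clean_chunk(toy)  # column names come back lowercase, each NaN becomes 0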

Reading the SAS file

We read the file in chunks of about 2 million rows each, so as not to saturate memory. The file holds about 62 million rows in total.

We save the chunks to Apache Parquet files as we go.

# Time on CASD: < 20 minutes.

# Clear the output directory and recreate it empty
shutil.rmtree(OUT_PATH, ignore_errors=True)
Path(OUT_PATH).mkdir(parents=True, exist_ok=True)

# Stream the SAS file chunk by chunk (encoding iso8859-15, i.e. Latin-9).
dfi = pd.read_sas(
    SAS_FILE, chunksize=taille_chunk, encoding="iso8859-15", iterator=True
)

dd_values = None
print(f"Number of iterations: {61_689_822 / taille_chunk:.0f}")
for i, chunk in enumerate(tqdm(dfi)):
    # Drop the previous vaex dataframe before loading the next chunk.
    del dd_values
    dd_values = None
    chunk = clean_chunk(chunk)
    # Convert the pandas chunk to vaex and export it as Parquet.
    dd_values = vaex.from_pandas(chunk, copy_index=False)
    dd_values.export(f"{OUT_PATH}{year}_{i}.parquet")
    #### DEBUG
    # if i >= 1:  # stop after two chunks
    #     break
    #### DEBUG
Number of iterations: 31
CPU times: total: 18min 18s
Wall time: 19min 1s
23it [19:00, 49.60s/it]
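
To check the export, the chunk files can be reopened lazily as one dataframe; a minimal sketch relying on vaex's wildcard support (df_check is a name introduced here, and the shape is not output from CASD):

# Reopen every exported chunk as one out-of-core dataframe (vaex expands the glob).
df_check = vaex.open(f"{OUT_PATH}{year}_*.parquet")
df_check.shape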
chunk.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 653064 entries, 44000000 to 44653063
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ident_s           653064 non-null  float64
 1   filt              653064 non-null  object 
 2   nb_postes_du_nir  653064 non-null  float64
 3   nbheur            653064 non-null  float64
 4   eqtp              653064 non-null  float64
 5   duree             653064 non-null  float64
 6   domempl           653064 non-null  object 
 7   datdeb            653059 non-null  float64
 8   a17               653058 non-null  object 
 9   a88               653058 non-null  object 
 10  cris              652813 non-null  object 
 11  treffect          653064 non-null  object 
 12  eff_3112          653064 non-null  float64
 13  contrat_travail   653059 non-null  object 
 14  motifcdd          279393 non-null  object 
 15  s_brut            653064 non-null  float64
 16  pepa              653064 non-null  float64
 17  net               653064 non-null  float64
dtypes: float64(10), object(8)
memory usage: 89.7+ MB
pd.set_option("display.max_columns", None)
chunk
chunk.describe()
            ident_s  nb_postes_du_nir         nbheur           eqtp          duree         datdeb       eff_3112         s_brut           pepa           net
count  6.530640e+05     653064.000000  653064.000000  653064.000000  653064.000000  653059.000000  653064.000000   6.530640e+05  653064.000000  6.530640e+05
mean   1.716290e+07          1.415869     463.290001       0.204568     130.023061     124.084743    1069.990837   6.988259e+03      19.715148  4.699575e+03
std    9.919750e+06          1.061930     530.080587       0.264277     123.362161     119.068893    7376.962829   3.081417e+04     126.067164  2.573969e+04
min    2.600000e+01          1.000000       0.000000       0.000000       1.000000     -30.000000       0.000000  -1.152800e+04       0.000000  0.000000e+00
25%    8.576866e+06          1.000000      74.000000       0.011540      30.000000       1.000000       2.000000   9.122926e+02       0.000000  1.872300e+02
50%    1.717066e+07          1.000000     250.000000       0.087090      88.000000      91.000000      10.000000   2.886169e+03       0.000000  1.587120e+03
75%    2.575884e+07          1.000000     642.000000       0.288190     210.000000     238.000000      79.000000   7.681662e+03       0.000000  5.377038e+03
max    3.432549e+07         32.000000   12395.000000       1.000000     360.000000     360.000000  190828.000000   1.917958e+07    2000.000000  1.680945e+07
# describe() above shows a negative minimum for s_brut; inspect those rows.
chunk.query("s_brut < 0")