from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"CASD : Conversion de l’extraction SAS en Apache Parquet
year = "2020"
# year = "2018"
SAS_FILE = (
r"C:\Users\Public\Documents\TRAVAIL\agregats\sas/"
#+ "extrait_dads_2020.sas7bdat"
+ "extrait_dads_2020_220809.sas7bdat"
)
OUT_PATH = r"C:\Users\Public\Documents\TRAVAIL\agregats\data\chunks\"
OUT_PATH = OUT_PATH + "extrait_dads_" + year + r"-chunk/"
taille_chunk = 2 * 2**20import shutil
from pathlib import Path
import pandas as pd
import vaex
from tqdm import tqdmdef clean_chunk(chunk):
chunk.columns = [c.lower() for c in chunk.columns.to_list()]
for (
col
) in "NB_POSTES_DU_NIR NBHEUR EQTP DUREE EFF_3112 S_BRUT PEPA NET".lower().split(
" "
):
chunk[col].fillna(0, inplace=True)
return chunkLecture du fichier SAS
On va lire le fichier par morceau de 1 million de lignes, pour ne pas saturer la mémoire. Il y a 39 millions de lignes.
On va les enregistrer au fur et à mesure en format Apache Arrow.
# Temps sur CASD : < 20 minutes.
# Efface le dossier de sortie
shutil.rmtree(OUT_PATH, ignore_errors=True)
Path(OUT_PATH).mkdir(parents=True, exist_ok=True)
dfi = pd.read_sas(
SAS_FILE, chunksize=taille_chunk, encoding="iso8859-15", iterator=True
)
dd_values = None
i = 0
print(f"Nombre d'itérations : {61_689_822/taille_chunk:.0f}")
for chunk in tqdm(dfi):
del dd_values
dd_values = None
chunk = clean_chunk(chunk)
dd_values = vaex.from_pandas(chunk, copy_index=False)
dd_values.export(f"{OUT_PATH}{year}_{i}.parquet")
#### DEBUG
i += 1
# if i>=2:
# break
#### DEBUGNombre d'itérations : 31
23it [19:00, 49.60s/it]
CPU times: total: 18min 18s
Wall time: 19min 1s
chunk.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 653064 entries, 44000000 to 44653063
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ident_s 653064 non-null float64
1 filt 653064 non-null object
2 nb_postes_du_nir 653064 non-null float64
3 nbheur 653064 non-null float64
4 eqtp 653064 non-null float64
5 duree 653064 non-null float64
6 domempl 653064 non-null object
7 datdeb 653059 non-null float64
8 a17 653058 non-null object
9 a88 653058 non-null object
10 cris 652813 non-null object
11 treffect 653064 non-null object
12 eff_3112 653064 non-null float64
13 contrat_travail 653059 non-null object
14 motifcdd 279393 non-null object
15 s_brut 653064 non-null float64
16 pepa 653064 non-null float64
17 net 653064 non-null float64
dtypes: float64(10), object(8)
memory usage: 89.7+ MB
pd.set_option("display.max_columns", None)
chunkchunk.describe()| ident_s | nb_postes_du_nir | nbheur | eqtp | duree | datdeb | eff_3112 | s_brut | pepa | net | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 6.530640e+05 | 653064.000000 | 653064.000000 | 653064.000000 | 653064.000000 | 653059.000000 | 653064.000000 | 6.530640e+05 | 653064.000000 | 6.530640e+05 |
| mean | 1.716290e+07 | 1.415869 | 463.290001 | 0.204568 | 130.023061 | 124.084743 | 1069.990837 | 6.988259e+03 | 19.715148 | 4.699575e+03 |
| std | 9.919750e+06 | 1.061930 | 530.080587 | 0.264277 | 123.362161 | 119.068893 | 7376.962829 | 3.081417e+04 | 126.067164 | 2.573969e+04 |
| min | 2.600000e+01 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | -30.000000 | 0.000000 | -1.152800e+04 | 0.000000 | 0.000000e+00 |
| 25% | 8.576866e+06 | 1.000000 | 74.000000 | 0.011540 | 30.000000 | 1.000000 | 2.000000 | 9.122926e+02 | 0.000000 | 1.872300e+02 |
| 50% | 1.717066e+07 | 1.000000 | 250.000000 | 0.087090 | 88.000000 | 91.000000 | 10.000000 | 2.886169e+03 | 0.000000 | 1.587120e+03 |
| 75% | 2.575884e+07 | 1.000000 | 642.000000 | 0.288190 | 210.000000 | 238.000000 | 79.000000 | 7.681662e+03 | 0.000000 | 5.377038e+03 |
| max | 3.432549e+07 | 32.000000 | 12395.000000 | 1.000000 | 360.000000 | 360.000000 | 190828.000000 | 1.917958e+07 | 2000.000000 | 1.680945e+07 |
chunk.query("s_brut < 0")