from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# CASD: converting SAS to Apache Parquet
# Year of the table to convert. It drives the input file name, the output
# folder and the names of the Parquet chunks.
# year = "2019"
year = "2020"
# year = "2018"

# Input SAS table and output folder, both derived from `year` so that changing
# the year above cannot leave the paths pointing at another year's data.
# NOTE(review): assumes every year follows the per_pote_<year> naming seen for
# 2020 -- confirm before switching years.
SAS_FILE = rf"C:\Users\Public\Documents\TRAVAIL\agregats\sas\per_pote_{year}.sas7bdat"
OUT_PATH = rf"C:\Users\Public\Documents\TRAVAIL\pote_brut\per_pote_{year}/"
# OUT_PATH = OUT_PATH + "assiettes_pote_brutes_" + year + "-chunk/"

# Number of rows read from the SAS file per chunk (one Parquet file per chunk).
taille_chunk = 2**20  # 2**20 = 1_048_576
# taille_chunk = 5000
# taille_chunk = 6000
import shutil
from pathlib import Path

import pandas as pd
import vaex
from tqdm import tqdm

# Runtime on CASD: < 20 minutes.
# Convert the SAS table to Parquet, one output file per chunk of rows.

# Start from an empty output folder so files from a previous run cannot
# remain next to the new ones.
shutil.rmtree(OUT_PATH, ignore_errors=True)
Path(OUT_PATH).mkdir(parents=True, exist_ok=True)

# Read the SAS table lazily, `taille_chunk` rows at a time.
dfi = pd.read_sas(
    SAS_FILE, chunksize=taille_chunk, encoding="iso8859-15", iterator=True
)

# Row count used only to announce how many chunks to expect.
# NOTE(review): presumably the row count of the 2020 table -- confirm.
NB_LIGNES_SAS = 39_512_402
# Ceiling division: a final partial chunk still costs one full iteration.
print(f"Nombre d'itérations : {-(-NB_LIGNES_SAS // taille_chunk)}")

dd_values = None
try:
    for i, chunk in enumerate(tqdm(dfi)):
        # chunk = chunk.drop(columns = ['FIP18_c'])
        # Normalise column names to lower case before export.
        chunk.columns = [c.lower() for c in chunk.columns]
        dd_values = vaex.from_pandas(chunk, copy_index=False)
        dd_values.export(f"{OUT_PATH}pote_{year}_{i}.parquet")
        # Drop the reference so the chunk can be freed before the next read.
        dd_values = None
        #### DEBUG
        # Uncomment to stop after two chunks.
        # if i + 1 >= 2:
        #     break
finally:
    # Release the file handle held by the SAS reader, even on failure.
    dfi.close()
#### DEBUG

# Output of the run:
# Nombre d'itérations : 38
# 38it [03:25, 5.41s/it]
# CPU times: total: 3min 21s
# Wall time: 3min 25s