# CASD : Conversion de SAS en Apache Parquet

from IPython.core.interactiveshell import InteractiveShell

# Ask IPython to display the value of every expression statement in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Year of the dataset to convert. It drives the input file name, the output
# directory and the names of the Parquet files, so they cannot get out of sync.
# Other values that appeared in earlier versions of this cell: "2018", "2019".
year = "2020"
SAS_FILE = rf"C:\Users\Public\Documents\TRAVAIL\agregats\sas\per_pote_{year}.sas7bdat"
OUT_PATH = rf"C:\Users\Public\Documents\TRAVAIL\pote_brut\per_pote_{year}/"
# Number of rows read from the SAS file per iteration; each chunk is written
# to its own Parquet file. (5000 and 6000 were also tried at some point.)
taille_chunk = 2**20  # 2**20 = 1_048_576
import shutil
from pathlib import Path

import pandas as pd
import vaex
from tqdm import tqdm
# Runtime on CASD: under 20 minutes.

# Wipe the output directory first so files from an earlier run cannot mix
# with the chunks written below.
shutil.rmtree(OUT_PATH, ignore_errors=True)
out_dir = Path(OUT_PATH)
out_dir.mkdir(parents=True, exist_ok=True)

# Row count used only to estimate the number of iterations.
# NOTE(review): hard-coded; presumably the row count of the 2020 file --
# confirm before converting another year.
nb_lignes = 39512402
# Ceiling division: a trailing partial chunk still costs one full iteration,
# which plain rounding would miss when it is smaller than half a chunk.
nb_iterations = -(-nb_lignes // taille_chunk)
print(f"Nombre d'itérations : {nb_iterations}")

# The chunked reader is used as a context manager so the SAS file handle is
# released even if an export fails part-way through.
with pd.read_sas(
    SAS_FILE, chunksize=taille_chunk, encoding="iso8859-15", iterator=True
) as dfi:
    for i, chunk in enumerate(tqdm(dfi)):
        # (An earlier version dropped the 'FIP18_c' column at this point.)
        # Normalise column names to lower case before export.
        chunk.columns = [c.lower() for c in chunk.columns]
        dd_values = vaex.from_pandas(chunk, copy_index=False)
        # One Parquet file per chunk, numbered from 0.
        dd_values.export(str(out_dir / f"pote_{year}_{i}.parquet"))
        # Drop the vaex frame before the next chunk is read to limit peak memory.
        del dd_values
        # For a quick debug run, break out of the loop here after a couple
        # of chunks (e.g. when i >= 1).
# Output of the run above:
#   Nombre d'itérations : 38
#   CPU times: total: 3min 21s
#   Wall time: 3min 25s
#   38it [03:25,  5.41s/it]