from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Distrib bi-variée de catégorie
# Data year to process (alternative kept from the original notebook: "2018").
year = "2020"
# Base DADS directory; the quantile JSON files are written under it.
OUT_PATH = r"C:\Users\Public\Documents\TRAVAIL\agregats\data\DADS/"
# Directory holding the Arrow chunks for the selected year, i.e.
# C:\Users\Public\Documents\TRAVAIL\agregats\data\chunks\extrait_dads_2020-chunk
ARROW_PATH = f"{OUT_PATH}/../chunks/extrait_dads_{year}-chunk/"
# Chunk size: twice 2**20 (2**20 = 1_048_576). A smaller value such as 5000
# was used as an alternative in the original notebook.
taille_chunk = 2 * (1 << 20)
# taille_chunk = 5000
import leximpact_prepare_data

leximpact_prepare_data.__version__
# Output: '0.0.15'
import json
import pandas as pd
import vaex
from tqdm import tqdm
from leximpact_prepare_data.scenario_tools.calib_and_copules import *

# Open everything matching the glob ARROW_PATH + "*" as one vaex DataFrame.
dfv = vaex.open(ARROW_PATH + "*")
# Sanity check: the loaded data must have the expected number of rows.
# NOTE(review): `tc` is not defined in the visible code; presumably it is a
# unittest.TestCase instance exposed by the star import above - confirm.
tc.assertEqual(len(dfv), 61_689_822)
# Recorded cell output:
#   CPU times: total: 328 ms
#   Wall time: 339 ms

# dfv.info()
dfv.get_column_names()
# Recorded cell output:
#   ['a17',
#    'a88',
#    'contrat_travail',
#    'cris',
#    'eff_3112',
#    'motifcdd',
#    'net',
#    'pepa',
#    's_brut',
#    'treffect']
# Categorical columns of interest.
# NOTE(review): this list is not used below, and "A17" is absent from the
# columns actually iterated - confirm whether that is intentional.
categorical = ["A17", "CONTRAT_TRAVAIL", "TREFFECT"]

# For every category of each categorical column, compute the quantiles of the
# secondary variable over the rows of `dfv` belonging to that category, and
# write them to one JSON file per (column, category) pair.
secondary = "pepa"  # column whose distribution is summarised
nb_quantile = 100  # number of quantiles requested from Quantile.get_quantile

for col_maj in tqdm(["CONTRAT_TRAVAIL", "TREFFECT"]):
    # The per-column count file lists the category values. It is read for the
    # configured `year` so that it stays consistent with the Arrow input and
    # with the output file name, which both depend on `year`.
    count_csv = (
        r"C:\Users\Public\Documents\TRAVAIL\agregats\data\DADS"
        rf"\count_{year}_{col_maj}.csv"
    )
    # Categories are read as strings, matching the string comparison below.
    df = pd.read_csv(count_csv, dtype={col_maj: str})
    # Column names in `dfv` are the lower-case form of the CSV column names.
    col = col_maj.lower()

    # Missing categories (NaN) are skipped: there is nothing to filter on.
    for cat in df[col_maj].dropna().tolist():
        vdf_cat = dfv[dfv[col] == str(cat)]
        # print(cat, vdf_cat.count())
        q = Quantile(vdf_cat[secondary].tolist())
        q_dict = q.get_quantile(nb_quantile)
        # keep_upper_bound_secret(q_dict)
        # NOTE(review): the file name ends with `col` a second time; it may
        # have been meant to carry `secondary` instead. Left unchanged so that
        # existing consumers of these files keep working - confirm.
        out_file = (
            f"{OUT_PATH}/quantile_DADS_{col}_{cat}_{nb_quantile}_{year}_{col}.json"
        )
        with open(out_file, "w", encoding="utf-8") as f:
            json.dump(q_dict, f)