Distribution bi-variée de catégorie

from IPython.core.interactiveshell import InteractiveShell

# Make IPython display the result of every top-level expression in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Year of the DADS extract to process (commented-out alternative: "2018").
year = "2020"
# Directory receiving the outputs of this notebook.
OUT_PATH = r"C:\Users\Public\Documents\TRAVAIL\agregats\data\DADS/"
# Directory holding the chunk files of the extract for `year`, reached
# relative to OUT_PATH; for 2020 this resolves to
# C:\Users\Public\Documents\TRAVAIL\agregats\data\chunks\extrait_dads_2020-chunk
ARROW_PATH = f"{OUT_PATH}/../chunks/extrait_dads_{year}-chunk/"
# Chunk size: two times 2**20 (a much smaller value such as 5000 was used
# as a commented-out alternative).
taille_chunk = 2 * 1_048_576
import leximpact_prepare_data

leximpact_prepare_data.__version__
'0.0.15'
import json

import pandas as pd
import vaex
from tqdm import tqdm

from leximpact_prepare_data.scenario_tools.calib_and_copules import *
# Open every file found in the chunk directory as a single vaex DataFrame.
dfv = vaex.open(f"{ARROW_PATH}*")
# Sanity check: the dataset is expected to hold exactly this many rows.
tc.assertEqual(len(dfv), 61_689_822)
CPU times: total: 328 ms
Wall time: 339 ms
# dfv.info()
# Display the names of the columns available in the opened dataset.
dfv.get_column_names()
['a17',
 'a88',
 'contrat_travail',
 'cris',
 'eff_3112',
 'motifcdd',
 'net',
 'pepa',
 's_brut',
 'treffect']
categorical = ["A17", "CONTRAT_TRAVAIL", "TREFFECT"]

# Column whose quantiles are computed inside each category, and the number of
# quantiles requested; both are identical for every category, so they are
# defined once instead of on every loop iteration.
secondary = "pepa"
nb_quantile = 100

# For each categorical column: read its list of categories from the
# corresponding count CSV, then write one JSON file per category holding the
# quantiles of `secondary` restricted to the rows of that category.
for col_maj in tqdm(["CONTRAT_TRAVAIL", "TREFFECT"]):
    # The count file is located from OUT_PATH and `year` rather than from a
    # hard-coded 2020 path, so changing `year` reads the matching categories.
    df = pd.read_csv(
        f"{OUT_PATH}count_{year}_{col_maj}.csv",
        dtype={col_maj: str},
    )
    # The dataset columns are lower-case while the CSV columns are upper-case.
    col = col_maj.lower()
    # Missing categories (NaN) are skipped: there is nothing to filter on.
    for cat in df[col_maj].dropna().tolist():
        vdf_cat = dfv[dfv[col] == str(cat)]
        # print(cat, vdf_cat.count())
        q = Quantile(vdf_cat[secondary].tolist())
        q_dict = q.get_quantile(nb_quantile)
        # keep_upper_bound_secret(q_dict)
        # NOTE(review): `{col}` appears twice in the file name; the trailing
        # one may have been meant to be `{secondary}` — left unchanged so that
        # existing consumers of these files keep working. TODO confirm.
        with open(
            f"{OUT_PATH}/quantile_DADS_{col}_{cat}_{nb_quantile}_{year}_{col}.json",
            "w",
        ) as f:
            json.dump(q_dict, f)