---
title: Démo et développement du script de calibration
keywords: fastai
sidebar: home_sidebar
nb_path: "notebooks/retraitement_erfs-fpr/modules/calibration_01_dev_cal.ipynb"
---
import copy
# Imports du notebook (non exportés)
import json
import numpy as np
from leximpact_prepare_data.calib_and_copules import reduce_bucket_number
from leximpact_prepare_data.enlargement import enlarge
from leximpact_prepare_data.toolbase import compute_var_in_ff, create_simulation
# Reference years for the calibration run, read from the project configuration.
# NOTE(review): all three variables read the same key "YEAR_ERFS" — if the POTE
# data comes from a different year, `annee_pote` should presumably read a
# "YEAR_POTE" key instead; confirm against the project config schema.
annee_de_calcul = config.get("YEAR_ERFS")
annee_erfs = config.get("YEAR_ERFS")
annee_pote = config.get("YEAR_ERFS")
from leximpact_prepare_data.calib_and_copules import (
get_calib,
get_fake_data,
pandas_to_vaex,
)
# CALIB2
# Hand-written minimal calibration fixture (a "POTE-like" target distribution).
# Each bucket describes one slice of the calibrated variable:
#   seuil_var_inf / seuil_var_supp : lower / upper bound of the slice
#   nombre_ff_tranche              : number of foyers fiscaux in the slice
#   sum/mean/stdev_tranche_var     : aggregates of the variable over the slice
#   *_above_seuil                  : aggregates above the lower threshold
# (field semantics inferred from their names and use below — confirm against
# leximpact_prepare_data.calib_and_copules)
calib2 = {
    "lower_bound": 0.0,
    "upper_bound": 1000000000000000,
    "nb_foyer": {"zero": 10, "nonzero": 87},
    "buckets": [
        {
            "seuil_var_inf": 0,
            "seuil_var_supp": 0,
            "nombre_ff_tranche": 5,
            "sum_tranche_var": 0,
            "mean_tranche_var": 0,
            "stdev_tranche_var": 0,
            "nb_above_seuil": 87,
            "sum_var_above_seuil": 0,
            "ratio_nb_above_seuil": 0,
            "mean_var_above_seuil": 0,
        },
        {
            "seuil_var_inf": 0,
            "seuil_var_supp": 1,
            "nombre_ff_tranche": 5,
            "sum_tranche_var": 0,
            "mean_tranche_var": 0,
            "stdev_tranche_var": 0,
            "nb_above_seuil": 87,
            "sum_var_above_seuil": 0,
            "ratio_nb_above_seuil": 0,
            "mean_var_above_seuil": 0,
        },
        {
            "seuil_var_inf": 1,
            "seuil_var_supp": 100,
            "nombre_ff_tranche": 2,
            "sum_tranche_var": 70,
            "mean_tranche_var": 35,
            "stdev_tranche_var": 0,
            "nb_above_seuil": 87,
            "sum_var_above_seuil": 0,
            "ratio_nb_above_seuil": 0,
            "mean_var_above_seuil": 0,
        },
        {
            "seuil_var_inf": 100,
            "seuil_var_supp": 200,
            "nombre_ff_tranche": 6,
            "sum_tranche_var": 1030,  # Almost equal to the actual sum = 1049
            "mean_tranche_var": 1030 / 6,
            "stdev_tranche_var": 0,
            "nb_above_seuil": 0,
            "sum_var_above_seuil": 0,
            "ratio_nb_above_seuil": 0,
            "mean_var_above_seuil": 0,
        },
        {
            "seuil_var_inf": 200,
            "seuil_var_supp": 2000,
            "nombre_ff_tranche": 19,
            "sum_tranche_var": 6500,
            "mean_tranche_var": 342,
            "stdev_tranche_var": 0,
            "nb_above_seuil": 87,
            "sum_var_above_seuil": 0,
            "ratio_nb_above_seuil": 0,
            "mean_var_above_seuil": 0,
        },
        {
            "seuil_var_inf": 2000,
            "seuil_var_supp": 10000,
            "nombre_ff_tranche": 6,
            "sum_tranche_var": 8000,
            "mean_tranche_var": 1333,
            "stdev_tranche_var": 0,
            "nb_above_seuil": 87,
            "sum_var_above_seuil": 0,
            "ratio_nb_above_seuil": 0,
            "mean_var_above_seuil": 0,
        },
        {
            "seuil_var_inf": 10000,
            "seuil_var_supp": 25000,
            "nombre_ff_tranche": 3,
            "sum_tranche_var": 24350,
            "mean_tranche_var": 12000,
            "stdev_tranche_var": 0,
            "nb_above_seuil": 87,
            "sum_var_above_seuil": 0,
            "ratio_nb_above_seuil": 0,
            "mean_var_above_seuil": 0,
        },
        {
            "seuil_var_inf": 25000,
            "seuil_var_supp": 50000,
            "nombre_ff_tranche": 2,
            "sum_tranche_var": 90_000,
            "mean_tranche_var": 40000,
            "stdev_tranche_var": 0,
            "nb_above_seuil": 87,
            "sum_var_above_seuil": 0,
            "ratio_nb_above_seuil": 0,
            "mean_var_above_seuil": 0,
        },
    ],
}
# ERFS_FF2
# Minimal hand-written ERFS base: one row per foyer fiscal, with the weights
# forced to 1 so counts and sums are easy to check by hand.
erfs_ff2 = None
erfs_ff2 = pd.DataFrame(
    [
        [1, 1, 0],
        [2, 1, 0],
        [3, 1, 0],  # Bucket of 3 foyers at zero
        [4, 1, 46],
        [5, 1, 99],
        [6, 1, 90],  # Bucket of 2 foyers > sum_min
        [7, 1, 250],
        [8, 1, 300],
        [9, 1, 1000],  # Bucket of 3 foyers < sum_max
        [10, 1, 21_000],
        [11, 1, 12_000],  # Bucket of 2 foyers to calibrate
        [12, 1, 100],
        [13, 1, 199],
        [14, 1, 180],
        [15, 1, 185],
        [16, 1, 198],
        [17, 1, 187],  # Bucket where err < err_max
        [18, 1, 26_000],
        [19, 1, 49_000],  # Iterative calibration
    ],
    columns=["idfoy", "wprm", "rfr"],
)
# erfs_ff2
# Make every row its own déclarant principal / famille / ménage.
erfs_ff2["quifoy"] = 0
erfs_ff2["idfam"] = 0
erfs_ff2["idmen"] = 0
# Duplicate the base (factor 3) with fake ids — presumably adds a "fake_id"
# column used by the calibration below; see leximpact_prepare_data.enlargement.
erfs_ff2 = enlarge(erfs_ff2, 3)
erfs_ff2test = erfs_ff2.copy()
# Real (enlarged, individual-level) ERFS base for the configured year.
erfs_02 = pd.read_hdf(
    config.get("DATA_OUT") + "02_erfs_enlarged_ind" + config.get("YEAR_ERFS") + ".h5"
)
len(erfs_02)  # IN INDIVIDUALS!
erfs_02.columns
erfs_02 = erfs_02.fillna(0)
tc.assertEqual(erfs_02.isna().sum().sum(), 0)
# Sub-sample to keep the OpenFisca simulation fast; fixed seed for reproducibility.
erfs_02 = erfs_02.sample(n=100000, random_state=35)  # 40
from openfisca_france import FranceTaxBenefitSystem
TBS = FranceTaxBenefitSystem()
# Build an OpenFisca simulation and compute rfr at the foyer-fiscal level.
my_simu, dico = create_simulation(data=erfs_02, tbs=TBS, period=annee_de_calcul)
var_list = ["rfr"]
cols_declarant_principal = ["rfr"]
print(erfs_02.columns)
erfs_03 = compute_var_in_ff(
    my_simu, annee_de_calcul, erfs_02, var_list, cols_declarant_principal
)
print("Nombre d'individus : ", len(erfs_03))
erfs_03
print("Max : ", erfs_03["rfr"].max())
print("Min : ", erfs_03["rfr"].min())
print("Somme pondérée : ", (erfs_03["rfr"] * erfs_03["wprm"]).sum())
erfs_03["wprm"] = 1  # So that every foyer has a weight of 1
erfs_03["revkire"] = erfs_03["rfr"]
erfs_03 = erfs_03.sort_values(by="revkire")
# erfs_03.columns
# Generation of the fake calib: build a 100-quantile calibration from the base
# itself, so targets are self-consistent by construction.
vaex_df = pandas_to_vaex(erfs_03)
calib = get_calib(vaex_df, "revkire", 100)
erfs_03 = erfs_03.drop(["revkire"], axis=1)
erfs_03.columns
# Extend the generated calibration with three hand-crafted high-income
# buckets, beyond the range covered by the ERFS sample.
_extra_bucket_specs = [
    # (seuil_var_inf, seuil_var_supp, nombre_ff_tranche, mean_tranche_var)
    (953_614.5, 3_250_000, 21, 1_500_000),  # includes the max of the ERFS base
    (3_250_000, 4_100_000, 6, 4_000_000),
    (4_100_000, 5_000_000, 2, 4_200_000),
]
for _inf, _sup, _nb, _mean in _extra_bucket_specs:
    calib["buckets"].append(
        {
            "seuil_var_inf": _inf,
            "seuil_var_supp": _sup,
            "nombre_ff_tranche": _nb,
            "sum_tranche_var": _nb * _mean,
            "mean_tranche_var": _mean,
            "stdev_tranche_var": 0,
            "nb_above_seuil": 0,
            "sum_var_above_seuil": 0,
            "ratio_nb_above_seuil": 0,
            "mean_var_above_seuil": 0,
        }
    )
# erfs_03.tail()
print("Sum avant :", erfs_03["rfr"].sum())
# Jitter every rfr by a random factor in [0.5, 1.5) so the ERFS values no
# longer match the calibration targets exactly (there is something to calibrate).
randomizing = lambda row: row * (np.random.rand() + 0.5)  # *5
erfs_03["rfr"] = erfs_03["rfr"].apply(randomizing)
print("Sum apres :", erfs_03["rfr"].sum())
# erfs_03.tail()
# Triple the base with fake households, then rebuild the foyer-fiscal view.
erfs_03 = enlarge(erfs_03, 3)
erfs_03.columns
erfs_03.head()
erfs_03_ff = individus_to_foyers_fiscaux(erfs_03)
tc.assertEqual(erfs_03["idfoy"].nunique(), erfs_03_ff["idfoy"].nunique())
# erfs_03_ff.head()
print("Nombre de foyers : ", len(erfs_03_ff))
print("Somme des poids : ", erfs_03_ff["wprm"].sum())
# To limit computations, keep only the variables we need.
erfs_03_ff = erfs_03_ff[["idfoy", "wprm", "fake_id", "rfr"]]
erfs_test = erfs_03_ff.copy(deep=True)  # To reuse it in other tests
erfs_ind = erfs_03.copy(deep=True)
calib_test = calib.copy()
print(
    "Gens de rfr nul dans la base ERFS : ",
    erfs_03_ff[erfs_03_ff["rfr"] < calib["buckets"][1]["seuil_var_inf"]]["wprm"].sum(),
)
var_name = "rfr"
# erfs = erfs_ff2
# calib = calib2
# Quick visual checks of the two fixtures before running the tests.
erfs_ff2.head()
calib2["buckets"][5]
len(calib2["buckets"])
# calib = calib_test
erfs_test.head()
calib_test["buckets"][8]
def test_get_minimal_frontiers(calib, erfs, var_name):
    """Run get_minimal_frontiers() and sanity-check its output.

    Verifies that buckets can only be merged (never created), that the
    bucket and frontier counts are consistent, and that all frontier
    values are distinct.  Returns the reduced calib and the frontiers.
    """
    reduced_calib, frontiers = get_minimal_frontiers(calib, erfs, var_name)
    # Merging may only shrink the bucket list.
    assert len(reduced_calib["buckets"]) <= len(calib["buckets"])
    # n buckets require exactly n + 1 frontiers.
    assert len(frontiers) - 1 == len(reduced_calib["buckets"])
    # No duplicated frontier values.
    assert len(frontiers) == len(set(frontiers))
    return reduced_calib, frontiers
# Minimal data: frontiers and merged buckets have known expected values.
calibT1, frontieres_varT1 = test_get_minimal_frontiers(calib2, erfs_ff2, var_name)
assert frontieres_varT1 == [0, 1.0, 100, 200, 2000, 10000, 25000, 50001]
assert calibT1["buckets"][0]["nombre_ff_tranche"] == 10  # Buckets merged correctly
# Reduced base: only the final bucket count is deterministic here.
calibT2, frontieres_varT = test_get_minimal_frontiers(calib_test, erfs_test, var_name)
assert len(calibT2["buckets"]) == 105
def test_get_minimal_frontiers2(base):
    """Regression test for get_minimal_frontiers() on a realistic extract:
    merging buckets must preserve the total number of foyers fiscaux."""
    # Calibration extract with many degenerate buckets (seuil_inf == seuil_sup),
    # as produced on real POTE data; values copied verbatim from an actual run.
    calib3 = {
        "lower_bound": 0,
        "upper_bound": 1000000000000000,
        "nb_foyer": {"zero": 23433471, "nonzero": 15831225},
        "buckets": [
            {
                "seuil_var_inf": 0,
                "seuil_var_supp": 0,
                "nombre_ff_tranche": 23433471,
                "sum_tranche_var": 0,
                "mean_tranche_var": 0,
                "nb_above_seuil": 15831225,
                "sum_var_above_seuil": 89542579670.09967,
                "ratio_nb_above_seuil": 0.40319234866863607,
                "mean_var_above_seuil": 5656.073972172063,
            },
            {
                "seuil_var_inf": 0.7,
                "seuil_var_supp": 1.0,
                "nombre_ff_tranche": 158312,
                "sum_tranche_var": 158290.1,
                "mean_tranche_var": 0.9998616655717824,
                "nb_above_seuil": 15672913,
                "sum_var_above_seuil": 89542421379.99966,
                "ratio_nb_above_seuil": 0.3991604315489925,
                "mean_var_above_seuil": 5713.195841768513,
            },
            {
                "seuil_var_inf": 1.0,
                "seuil_var_supp": 1.0,
                "nombre_ff_tranche": 158312,
                "sum_tranche_var": 158312.0,
                "mean_tranche_var": 1.0,
                "nb_above_seuil": 15514601,
                "sum_var_above_seuil": 89542263067.99966,
                "ratio_nb_above_seuil": 0.39512851442934893,
                "mean_var_above_seuil": 5771.483460515657,
            },
            {
                "seuil_var_inf": 1.0,
                "seuil_var_supp": 1.0,
                "nombre_ff_tranche": 158312,
                "sum_tranche_var": 158312.0,
                "mean_tranche_var": 1.0,
                "nb_above_seuil": 15356289,
                "sum_var_above_seuil": 89542104755.99966,
                "ratio_nb_above_seuil": 0.39109659730970536,
                "mean_var_above_seuil": 5830.9728838783685,
            },
            {
                "seuil_var_inf": 1.0,
                "seuil_var_supp": 1.0,
                "nombre_ff_tranche": 158313,
                "sum_tranche_var": 158313.0,
                "mean_tranche_var": 1.0,
                "nb_above_seuil": 15197976,
                "sum_var_above_seuil": 89541946442.99966,
                "ratio_nb_above_seuil": 0.3870646547218906,
                "mean_var_above_seuil": 5891.70205578688,
            },
            {
                "seuil_var_inf": 1.0,
                "seuil_var_supp": 1.0,
                "nombre_ff_tranche": 158312,
                "sum_tranche_var": 158312.0,
                "mean_tranche_var": 1.0,
                "nb_above_seuil": 15039664,
                "sum_var_above_seuil": 89541788130.99966,
                "ratio_nb_above_seuil": 0.38303273760224704,
                "mean_var_above_seuil": 5953.709346897621,
            },
            {
                "seuil_var_inf": 1.0,
                "seuil_var_supp": 1.0,
                "nombre_ff_tranche": 158312,
                "sum_tranche_var": 158312.0,
                "mean_tranche_var": 1.0,
                "nb_above_seuil": 14881352,
                "sum_var_above_seuil": 89541629818.99966,
                "ratio_nb_above_seuil": 0.37900082048260353,
                "mean_var_above_seuil": 6017.035939946832,
            },
            {
                "seuil_var_inf": 1.0,
                "seuil_var_supp": 1.0,
                "nombre_ff_tranche": 158312,
                "sum_tranche_var": 158312.0,
                "mean_tranche_var": 1.0,
                "nb_above_seuil": 14723040,
                "sum_var_above_seuil": 89541471506.99966,
                "ratio_nb_above_seuil": 0.37496890336295996,
                "mean_var_above_seuil": 6081.724392992185,
            },
            {
                "seuil_var_inf": 1.0,
                "seuil_var_supp": 1.0,
                "nombre_ff_tranche": 158313,
                "sum_tranche_var": 158313.0,
                "mean_tranche_var": 1.0,
                "nb_above_seuil": 14564727,
                "sum_var_above_seuil": 89541313193.99966,
                "ratio_nb_above_seuil": 0.3709369607751452,
                "mean_var_above_seuil": 6147.819536473266,
            },
            {
                "seuil_var_inf": 1.0,
                "seuil_var_supp": 1.0,
                "nombre_ff_tranche": 158312,
                "sum_tranche_var": 158312.0,
                "mean_tranche_var": 1.0,
                "nb_above_seuil": 14406415,
                "sum_var_above_seuil": 89541154881.99966,
                "ratio_nb_above_seuil": 0.36690504365550164,
                "mean_var_above_seuil": 6215.366896066764,
            },
            {
                "seuil_var_inf": 1.0,
                "seuil_var_supp": 1.0,
                "nombre_ff_tranche": 158312,
                "sum_tranche_var": 158312.0,
                "mean_tranche_var": 1.0,
                "nb_above_seuil": 14248103,
                "sum_var_above_seuil": 89540996569.99966,
                "ratio_nb_above_seuil": 0.36287312653585807,
                "mean_var_above_seuil": 6284.415305672598,
            },
            {
                "seuil_var_inf": 1.0,
                "seuil_var_supp": 2.0,
                "nombre_ff_tranche": 158312,
                "sum_tranche_var": 311850.6,
                "mean_tranche_var": 1.9698481479609882,
                "nb_above_seuil": 14089791,
                "sum_var_above_seuil": 89540684719.39966,
                "ratio_nb_above_seuil": 0.3588412094162145,
                "mean_var_above_seuil": 6355.004465247189,
            },
            {
                "seuil_var_inf": 2.0,
                "seuil_var_supp": 2.0,
                "nombre_ff_tranche": 158313,
                "sum_tranche_var": 316626.0,
                "mean_tranche_var": 2.0,
                "nb_above_seuil": 13931478,
                "sum_var_above_seuil": 89540368093.39966,
                "ratio_nb_above_seuil": 0.35480926682839975,
                "mean_var_above_seuil": 6427.198039820301,
            },
            {
                "seuil_var_inf": 2.0,
                "seuil_var_supp": 2.0,
                "nombre_ff_tranche": 158312,
                "sum_tranche_var": 316624.0,
                "mean_tranche_var": 2.0,
                "nb_above_seuil": 13773166,
                "sum_var_above_seuil": 89540051469.39966,
                "ratio_nb_above_seuil": 0.3507773497087562,
                "mean_var_above_seuil": 6501.050772886906,
            },
            {
                "seuil_var_inf": 2.0,
                "seuil_var_supp": 2.0,
                "nombre_ff_tranche": 158312,
                "sum_tranche_var": 316624.0,
                "mean_tranche_var": 2.0,
                "nb_above_seuil": 13614854,
                "sum_var_above_seuil": 89539734845.39966,
                "ratio_nb_above_seuil": 0.3467454325891126,
                "mean_var_above_seuil": 6576.621008598378,
            },
            {
                "seuil_var_inf": 2.0,
                "seuil_var_supp": 2.0,
                "nombre_ff_tranche": 158312,
                "sum_tranche_var": 316624.0,
                "mean_tranche_var": 2.0,
                "nb_above_seuil": 13456542,
                "sum_var_above_seuil": 89539418221.39966,
                "ratio_nb_above_seuil": 0.34271351546946904,
                "mean_var_above_seuil": 6653.969364596021,
            },
            {
                "seuil_var_inf": 2.0,
                "seuil_var_supp": 3.0,
                "nombre_ff_tranche": 158313,
                "sum_tranche_var": 467797.4,
                "mean_tranche_var": 2.954889364739472,
                "nb_above_seuil": 13298229,
                "sum_var_above_seuil": 89538950423.99966,
                "ratio_nb_above_seuil": 0.3386815728816543,
                "mean_var_above_seuil": 6733.14848345593,
            },
            {
                "seuil_var_inf": 3.0,
                "seuil_var_supp": 3.0,
                "nombre_ff_tranche": 158312,
                "sum_tranche_var": 474936.0,
                "mean_tranche_var": 3.0,
                "nb_above_seuil": 13139917,
                "sum_var_above_seuil": 89538475487.99966,
                "ratio_nb_above_seuil": 0.3346496557620107,
                "mean_var_above_seuil": 6814.234480172109,
            },
            {
                "seuil_var_inf": 3.0,
                "seuil_var_supp": 3.0,
                "nombre_ff_tranche": 158312,
                "sum_tranche_var": 474936.0,
                "mean_tranche_var": 3.0,
                "nb_above_seuil": 12981605,
                "sum_var_above_seuil": 89538000551.99966,
                "ratio_nb_above_seuil": 0.33061773864236715,
                "mean_var_above_seuil": 6897.2981809259845,
            },
            {
                "seuil_var_inf": 3.0,
                "seuil_var_supp": 4.0,
                "nombre_ff_tranche": 158312,
                "sum_tranche_var": 594951.2,
                "mean_tranche_var": 3.7580928798827626,
                "nb_above_seuil": 12823293,
                "sum_var_above_seuil": 89537405600.79965,
                "ratio_nb_above_seuil": 0.32658582152272364,
                "mean_var_above_seuil": 6982.403474739262,
            },
            {
                "seuil_var_inf": 4.0,
                "seuil_var_supp": 4.0,
                "nombre_ff_tranche": 158313,
                "sum_tranche_var": 633252.0,
                "mean_tranche_var": 4.0,
                "nb_above_seuil": 12664980,
                "sum_var_above_seuil": 89536772348.79967,
                "ratio_nb_above_seuil": 0.32255387893490883,
                "mean_var_above_seuil": 7069.633931423474,
            },
            {
                "seuil_var_inf": 4.0,
                "seuil_var_supp": 5.0,
                "nombre_ff_tranche": 158312,
                "sum_tranche_var": 731992.6000000001,
                "mean_tranche_var": 4.6237341452322,
                "nb_above_seuil": 12506668,
                "sum_var_above_seuil": 89536040356.19966,
                "ratio_nb_above_seuil": 0.3185219618152653,
                "mean_var_above_seuil": 7159.064297237254,
            },
            {
                "seuil_var_inf": 5.0,
                "seuil_var_supp": 6.0,
                "nombre_ff_tranche": 158312,
                "sum_tranche_var": 811433.2999999995,
                "mean_tranche_var": 5.125532492799026,
                "nb_above_seuil": 12348356,
                "sum_var_above_seuil": 89535228922.89967,
                "ratio_nb_above_seuil": 0.31449004469562175,
                "mean_var_above_seuil": 7250.781312338231,
            },
        ],
    }
    # Total number of foyers before the bucket merge...
    tot_av = 0
    for i in range(len(calib3["buckets"])):
        tot_av += calib3["buckets"][i]["nombre_ff_tranche"]
    calib3p, front3 = get_minimal_frontiers(calib3, base, "rfr")
    # ...and after: merging must neither lose nor duplicate any foyer.
    tot_ap = 0
    for i in range(len(calib3p["buckets"])):
        tot_ap += calib3p["buckets"][i]["nombre_ff_tranche"]
    print(front3)
    print(calib3p)
    tc.assertEqual(tot_av, tot_ap)
# Reduced base: bucket merging must preserve the total foyer count.
test_get_minimal_frontiers2(erfs_test)
# Inspect the first non-zero bucket of the generated calibration.
calib_test["buckets"][1]
def test_nb_zero(erfs, var_name, calib):
    """Check nb_zero()'s counts of zero-valued foyers in ERFS and POTE."""
    zeros_erfs, zeros_pote = nb_zero(erfs, var_name, calib)
    # The ERFS base is randomly sampled, hence only a lower bound;
    # the POTE count comes from the fixed calibration and is exact.
    assert zeros_erfs >= 41127
    assert zeros_pote == 61817
test_nb_zero(erfs_test, var_name, calib_test)
def test_distrib_to_quantiles(erfs_ff, var_name, calib):
    """Thin wrapper: run distrib_to_quantiles() and return its
    (Distrib_ERFS, Distrib_POTE, calib) result unchanged."""
    return distrib_to_quantiles(erfs_ff, var_name, calib)
# Minimal data: bucket aggregates have known expected values.
Distrib_ERFST, Distrib_POTET, calibT = test_distrib_to_quantiles(
    erfs_ff2, var_name, calib2
)
assert Distrib_POTET.df.iloc[0]["nb_ff"] == 10
assert Distrib_POTET.df.iloc[4]["sum"] == 8000
assert Distrib_ERFST.df.iloc[4]["sum"] == 0
assert Distrib_ERFST.df.iloc[5]["nb_ff"] == 2
assert Distrib_ERFST.df.iloc[6]["sum"] == 75000
# Reduced base (no exact values possible: the base contains random draws)
Distrib_ERFST, Distrib_POTET, calibT = test_distrib_to_quantiles(
    erfs_test, var_name, calib_test
)
# Each summary row must be consistent with its own bucket sample...
sampleT = Distrib_ERFST.bucket_list["6"].sample
assert Distrib_ERFST.df.iloc[6]["sum"] == (sampleT["wprm"] * sampleT[var_name]).sum()
# assert Distrib_ERFST.df.iloc[104]["sum"] == 0  # Empty bucket
# ...and expose the same thresholds as the summary dataframe.
assert (
    Distrib_ERFST.bucket_list["97"].seuil_max == Distrib_ERFST.df.iloc[97]["seuil_max"]
)
# Distributions and trackers reused by all the calibration tests below.
Distrib_ERFS2, Distrib_POTE2, calib = distrib_to_quantiles(erfs_ff2, "rfr", calib2)
tracker2 = Tracker(frontieres_varT1)
trackerT = Tracker(frontieres_varT)
Distrib_ERFST, Distrib_POTET, calib = distrib_to_quantiles(erfs_test, "rfr", calib_test)
# Plot ERFS vs POTE before calibration and record the initial errors.
fig3, error_avant_calib, final_error = compare_distributions(
    Distrib_ERFST.df,
    Distrib_POTET.df,
    "rfr",
    annee_erfs,
    annee_pote,
    log=False,
    title_suffix="avant_calib",
    df_cal=[],
)
def test_seuil_chomage(calib_in):
    """Check that seuil_chomage() leaves its input untouched and never adds buckets.

    Returns the capped calibration for further inspection.
    """
    m = len(calib_in["buckets"])
    calib_out = seuil_chomage(calib_in)
    # The input calibration must not be mutated.
    tc.assertEqual(m, len(calib_in["buckets"]))
    # BUG FIX: the original compared len(calib) — a *global* dict, whose len()
    # is its number of keys — against len(calib_out).  The intended check is
    # that the output has no more buckets than the input.
    tc.assertGreaterEqual(len(calib_in["buckets"]), len(calib_out["buckets"]))
    return calib_out
calib_out = test_seuil_chomage(calib_test)
# calib_out
# The capped calibration keeps its 100 buckets on this fixture.
tc.assertEqual(len(calib_out["buckets"]), 100)
# The last bucket is capped at the annual ceiling: 365 days * 241.22 per day.
tc.assertEqual(calib_out["buckets"][-1]["seuil_var_supp"], 365 * 241.22)
def test_save_made_up_people(erfs_ind):
    """Save the last four (made-up) individuals of the base and return
    the result of save_made_up_people() unchanged."""
    fabricated = erfs_ind[-4:]
    return save_made_up_people(erfs_ind, fabricated)
# Sort both bases by foyer id before appending made-up people.
erfs_test = erfs_test.sort_values(by="idfoy")
erfs_ind = erfs_ind.sort_values(by="idfoy")
erfs_ind.tail()
erfs_smup = test_save_made_up_people(erfs_ind)
# Visual check: the made-up people appear at the end of the base.
erfs_smup[-10:]
def test_ajout_gens_en_haut(
    erfs, erfs_ind, var_name, Distrib_POTE, Distrib_ERFS, calib
):
    """Add made-up foyers at the top of the distribution and verify that no
    quantile is left empty afterwards.

    Returns the enlarged base and the comparison figure.
    """
    enlarged, _ = ajout_gens_en_haut(erfs, erfs_ind, var_name, Distrib_POTE, Distrib_ERFS)
    added = len(enlarged) - len(erfs)
    print("On a ajouté ", added, " foyers dans la base")
    # Foyers may only be added, and ids must stay unique.
    assert added >= 0
    assert enlarged["idfoy"].nunique() == len(enlarged)
    # Recompute the quantiles on the enlarged base.
    distrib_erfs, distrib_pote, _ = distrib_to_quantiles(enlarged, var_name, calib)
    # Every bucket must now contain at least one foyer.
    assert distrib_erfs.df[distrib_erfs.df["nb_ff"] == 0].empty
    # Visualise the effect of the added people.
    figure, _, _ = compare_distributions(
        distrib_erfs.df,
        distrib_pote.df,
        var_name,
        annee_erfs,
        annee_pote,
        log=False,
        title_suffix="avant_calibration",
        df_cal=[],
    )
    return enlarged, figure
# Reduced data
# Distrib_POTET.df["seuil_inf"]
# Distrib_ERFST.df["seuil_inf"]
_ = test_ajout_gens_en_haut(
    erfs_test, erfs_ind, var_name, Distrib_POTET, Distrib_ERFST, calib_test
)
def test_merge_and_replace():
    """Unit-test merge_and_replace(): values from `sample` overwrite the
    matching rows of `base` for the listed columns only; rows absent from
    `base` are not added; other columns keep their original values."""
    # Reference base: idfoy / weight / calibrated variable.
    base = pd.DataFrame(
        [
            [11, 0, 0],
            [12, 1, 0],
            [13, 1, 0],
            [4, 1, 46],
            [5, 1, 99],
            [6, 1, 90],
            [7, 1, 250],
            [8, 1, 300],
            [9, 1, 1000],
        ],
        columns=["idfoy", "wprm", "var_cal"],
    )
    # Replacement sample: overlaps base on several idfoy, plus one unknown (78).
    sample = pd.DataFrame(
        [
            [11, 1, 0],
            [12, 1, 101],
            [4, 1, 46],
            [5, 1, 101],
            [7, 0, 250],
            [8, 1, 101],
            [78, 1, 101],
        ],
        columns=["idfoy", "wprm", "var_cal"],
    )
    base = merge_and_replace(base, sample, ["var_cal"])

    def cell(idfoy, col):
        # Scalar lookup.  FIX: the original used float(<single-row Series>),
        # which is deprecated/removed in recent pandas versions.
        return base.loc[base["idfoy"] == idfoy, col].iloc[0]

    assert cell(12, "var_cal") == 101  # the base is updated from the sample
    assert cell(5, "var_cal") == 101  # the base is updated from the sample
    assert cell(13, "var_cal") == 0  # rows absent from the sample are unchanged
    assert cell(11, "wprm") == 0  # only the listed column is touched
    assert base[base["idfoy"] == 78].empty  # new foyers are not added
test_merge_and_replace()
def test_calib_empty_bucket():
    """calib_empty_bucket() must flag bucket 2 of the tracker as empty."""
    trk = calib_empty_bucket(2, Tracker([1, 3, 5, 14]))
    # The bucket is marked fully in error, with the "empty" error type.
    assert trk.df["final_error"].iloc[2] == 1
    assert trk.df["error_type"][2] == "empty"
test_calib_empty_bucket()
def test_compute_min_max_calib(var_name, bucket):
    """Check compute_min_max_calib(): the attainable min/max bucket sums must
    equal the fixed real-foyer total plus the fake foyers weighted at the
    returned var_min / var_max bounds."""
    real = bucket.sample[bucket.sample["fake_id"] == 0]
    real_total = (real[var_name] * real["wprm"]).sum()
    fake = bucket.sample[bucket.sample["fake_id"] == 1].copy()
    lo, hi, v_min, v_max = bucket_bounds = compute_min_max_calib(var_name, bucket)
    assert lo == (v_min * fake["wprm"]).sum() + real_total
    assert hi == (v_max * fake["wprm"]).sum() + real_total
    return bucket_bounds
_ = test_compute_min_max_calib(var_name, Distrib_ERFST.bucket_list["3"])
_ = test_compute_min_max_calib(var_name, Distrib_ERFST.bucket_list["51"])
_ = test_compute_min_max_calib(var_name, Distrib_ERFST.bucket_list["89"])
# Minimal data: bucket "5" spans [10_000, 25_000).
min_possibleT, max_possibleT, var_minT, var_maxT = test_compute_min_max_calib(
    var_name, Distrib_ERFS2.bucket_list["5"]
)
# The attainable values stay strictly inside the bucket bounds.
assert var_minT == 10_000
assert var_maxT == 25_000 - 1
assert min_possibleT > 10_000 * Distrib_ERFS2.bucket_list["5"].nb_ff
assert max_possibleT < 25_000 * Distrib_ERFS2.bucket_list["5"].nb_ff
# Foyer-fiscal view of the individual base, used by the calibration below.
erfs_cal = individus_to_foyers_fiscaux(erfs_ind)
def test_init_calib_zero(var_name, erfs_ind, bucket_erfs, Distrib_POTE, tracker):
    """After init_calib_zero(), the weighted count of foyers calibrated to
    zero must equal the POTE zero-bucket count.

    NOTE: mutates `erfs_ind` in place (adds/overwrites the "var_cal" column).
    """
    erfs_ind["var_cal"] = 0.1  # non-zero sentinel: zeros must come from the calib
    tracker, calibrated_ind = init_calib_zero(
        var_name, erfs_ind, bucket_erfs, Distrib_POTE.bucket_list[str(0)], tracker
    )
    calibrated_ff = individus_to_foyers_fiscaux(calibrated_ind)
    # Target count (local renamed from `nb_zero`, which shadowed the helper
    # function of the same name).
    expected_zeros = Distrib_POTE.bucket_list["0"].nb_ff
    actual_zeros = round(
        calibrated_ff[calibrated_ff["var_cal"] == 0]["wprm"].sum(), 0
    )
    print(
        "Nb de gens à zéro post calib : ",
        actual_zeros,
        "\n",
    )
    tc.assertEqual(actual_zeros, expected_zeros)
# Minimal fixture: calibrate the zero bucket of the small hand-made base.
test_init_calib_zero(
    var_name,
    erfs_ff2test,
    Distrib_ERFS2.bucket_list["0"],
    Distrib_POTE2,
    tracker2,
)
# Reduced base: same check on the sampled ERFS data.
test_init_calib_zero(
    var_name,
    erfs_ind,
    Distrib_ERFST.bucket_list["0"],
    Distrib_POTET,
    trackerT,
)
# Inspect the POTE zero bucket (count and bounds).
print(Distrib_POTE2.bucket_list["0"].nb_ff)
print(Distrib_POTE2.bucket_list["0"].seuil_inf)
print(Distrib_POTE2.bucket_list["0"].seuil_max)
def test_amelioration_quantile(i, tracker, Distrib_ERFS, Distrib_POTE):
    """One amelioration_quantile() pass on bucket *i* must strictly reduce the
    relative error of the ERFS bucket sum against the POTE target.

    Fixes vs the original cell:
    - the fake-foyer sample is .copy()-ed before mutation (SettingWithCopy);
    - the tracker target is written with .loc instead of chained
      ``df["wanted"][key] = ...`` (broken under pandas copy-on-write).
    """
    bucket = Distrib_ERFS.bucket_list[str(i)]
    sample_fake_in = bucket.sample[bucket.sample["fake_id"] == 1].copy()
    sample_fake_in.loc[:, "var_cal"] = 0.1
    pote_sum = Distrib_POTE.bucket_list[str(i)].sum_
    tracker.df.loc[str(i), "wanted"] = pote_sum
    # Weighted sum of the real (non-fake) foyers: fixed during calibration.
    real = bucket.sample[bucket.sample["fake_id"] == 0]
    not_fake_sum = (real["wprm"] * real["rfr"]).sum()
    # Relative error of the whole bucket before improvement.
    error_in = (
        (bucket.sample["rfr"] * bucket.sample["wprm"]).sum() - pote_sum
    ) / pote_sum
    changing_var_name = "rfr"
    # Test
    print("Error before", error_in)
    improvement, current_sum, error, sample_fake_en_cours = amelioration_quantile(
        i, tracker, sample_fake_in, not_fake_sum, error_in, changing_var_name
    )
    print("Final error", error)
    assert abs(error) < abs(error_in)
test_amelioration_quantile(2, tracker2, Distrib_ERFS2, Distrib_POTE2)
test_amelioration_quantile(72, trackerT, Distrib_ERFST, Distrib_POTET)
def test_optimization_quantile(erfss, i, tracker, var_name, Distrib_ERFS, Distrib_POTE):
    """Full optimisation of bucket *i*: optimization_quantile() must bring the
    bucket error below the error observed before any calibration.

    Same pandas fixes as test_amelioration_quantile: explicit .copy() of the
    filtered fake sample, and .loc instead of chained assignment.
    """
    err_max = 0.01
    bucket = Distrib_ERFS.bucket_list[str(i)]
    sample_fake = bucket.sample[bucket.sample["fake_id"] == 1].copy()
    sample_fake.loc[:, "var_cal"] = 0.1
    wanted = Distrib_POTE.bucket_list[str(i)].sum_
    tracker.df.loc[str(i), "wanted"] = wanted
    # Weighted sum of the real (non-fake) foyers: fixed during calibration.
    real = bucket.sample[bucket.sample["fake_id"] == 0]
    not_fake_sum = (real["wprm"] * real["rfr"]).sum()
    # Relative error before any calibration.
    error_in = (
        (bucket.sample["rfr"] * bucket.sample["wprm"]).sum() - wanted
    ) / wanted
    changing_var_name = "rfr"
    print("Erreur avant toute calibration", error_in)
    erfss, tracker, sample_fake = optimization_quantile(
        erfss,
        i,
        tracker,
        sample_fake,
        not_fake_sum,
        error_in,
        changing_var_name,
        err_max,
    )
    # Recompute the bucket error from the calibrated fake sample.
    wanted = tracker.df.loc[str(i), "wanted"]
    error = (
        not_fake_sum + (sample_fake["wprm"] * sample_fake["var_cal"]).sum() - wanted
    ) / wanted
    print("final opt error", error)
    assert abs(error) < abs(error_in)
test_optimization_quantile(erfs_ff2, 2, tracker2, "rfr", Distrib_ERFS2, Distrib_POTE2)
test_optimization_quantile(erfs_test, 78, trackerT, "rfr", Distrib_ERFST, Distrib_POTET)
# Full quantile calibration on the minimal fixture...
err_max = 0.01
erfs_02_ind = erfs_ff2test
tracker2, erfs_cal_ff2, erfs_cal_ind2 = calibration_quantiles(
    var_name, erfs_02_ind, Distrib_ERFS2, Distrib_POTE2, err_max
)
erfs_cal_ff2.head()
# ...then on the reduced real base.
var_name = "rfr"
err_max = 0.01
trackerT, erfs_cal_test, _ = calibration_quantiles(
    var_name, erfs_ind, Distrib_ERFST, Distrib_POTET, err_max
)
# Recompute the quantiles on the calibrated base and compare to POTE.
Distrib_CALT, Distrib_POTET, calib = distrib_to_quantiles(
    erfs_cal_test, "rfr", calib_test
)
fig4, error_post_calib, final_error = compare_distributions(
    Distrib_ERFST.df,
    Distrib_POTET.df,
    "rfr",
    annee_erfs,
    annee_pote,
    log=False,
    title_suffix="_N_3",
    df_cal=Distrib_CALT.df,
)
Distrib_CALT.df
# End-to-end test of the high-level calibration() wrapper on another variable.
var_name = "chomage_brut"
erfs_cal_ff, erfs_cal_ind, Distribs, fig_var_cal = calibration(
    erfs_ind, var_name, annee_erfs, annee_pote, calib=calib_test
)
fig_var_cal
plt.show()
erfs_cal_ff.columns
erfs_cal_ind.columns