---
title: Démo et développement du script de calibration
keywords: fastai
sidebar: home_sidebar
nb_path: "notebooks/retraitement_erfs-fpr/modules/calibration_01_dev_cal.ipynb"
---
{% raw %}
{% endraw %}

Imports

{% raw %}
{% endraw %} {% raw %}
import copy

# Imports du notebook (non exportés)
import json

import numpy as np

from leximpact_prepare_data.calib_and_copules import reduce_bucket_number
from leximpact_prepare_data.enlargement import enlarge
from leximpact_prepare_data.toolbase import compute_var_in_ff, create_simulation
{% endraw %} {% raw %}
# Reference years used by the rest of the notebook, all read from the shared config.
annee_de_calcul = config.get("YEAR_ERFS")
annee_erfs = config.get("YEAR_ERFS")
# NOTE(review): annee_pote is read from the same "YEAR_ERFS" key as annee_erfs;
# confirm this is intended and that POTE has no dedicated year setting.
annee_pote = config.get("YEAR_ERFS")
{% endraw %}

[DEV] Fonctions développées pour la calibration

Création de quantiles

seuil_chomage()

{% raw %}

seuil_chomage[source]

seuil_chomage(calib_in)

{% endraw %} {% raw %}
{% endraw %}

get_minimal_frontiers()

{% raw %}

get_minimal_frontiers[source]

get_minimal_frontiers(calib, erfs, var_name)

{% endraw %} {% raw %}
{% endraw %}

nb_zero()

{% raw %}

nb_zero[source]

nb_zero(erfs_ff, var_name, calib)

{% endraw %} {% raw %}
{% endraw %}

Class DF_quantiles

{% raw %}

class DF_quantiles[source]

DF_quantiles(frontiers)

{% endraw %} {% raw %}
{% endraw %}

Class Bucket

{% raw %}

class Bucket_Base[source]

Bucket_Base(i, frontiers, erfs_ff, var_name)

{% endraw %} {% raw %}

class Bucket_Pote[source]

Bucket_Pote(i, frontiers, calib)

{% endraw %} {% raw %}
{% endraw %}

distrib_to_quantiles()

{% raw %}

distrib_to_quantiles[source]

distrib_to_quantiles(erfs_ff, var_name, calib)

{% endraw %} {% raw %}
{% endraw %}

Plot & Compare quantiles

generate_title()

{% raw %}

generate_title[source]

generate_title(var_name, annee_erfs, annee_pote, title_suffix, log=False, cal=False)

{% endraw %} {% raw %}
{% endraw %}

compare_distributions()

{% raw %}

compare_distributions[source]

compare_distributions(df_erfs, df_pote, var_name, annee_erfs, annee_pote, log, title_suffix, df_cal=[])

{% endraw %} {% raw %}
{% endraw %}

Calibration spécifique aux derniers buckets (ajout de gens en haut de la distribution)

ajout_gens_en_haut()

{% raw %}

ajout_gens_en_haut[source]

ajout_gens_en_haut(erfs, erfs_ind, var_name, Distrib_POTE, Distrib_ERFS)

{% endraw %} {% raw %}
{% endraw %}

create_ppl_HR()

{% raw %}

create_ppl_HR[source]

create_ppl_HR(erfs_ind, var_name, list_missing, Distrib_POTE, Distrib_ERFS)

{% endraw %} {% raw %}
{% endraw %}

save_made_up_people()

{% raw %}

save_made_up_people[source]

save_made_up_people(erfs, new_people)

{% endraw %} {% raw %}
{% endraw %}

ToolBase

Class Tracker

{% raw %}

class Tracker[source]

Tracker(frontiers)

{% endraw %} {% raw %}
{% endraw %}

merge_and_replace()

{% raw %}

merge_and_replace[source]

merge_and_replace(erfss, sample_to_merge, var_list)

{% endraw %} {% raw %}
{% endraw %}

prepare_for_calib()

{% raw %}

prepare_for_calib[source]

prepare_for_calib(erfs, var_name)

{% endraw %} {% raw %}
{% endraw %}

compute_min_max_calib()

{% raw %}

compute_min_max_calib[source]

compute_min_max_calib(var_name, bucket)

{% endraw %} {% raw %}
{% endraw %}

data_avant_calib()

{% raw %}

data_avant_calib[source]

data_avant_calib(bucket, tracker, var_name, wanted)

{% endraw %} {% raw %}
{% endraw %}

recalage_calib() - Non utilisée ici

{% raw %}

recalage_calib[source]

recalage_calib(erfs_cal, var_name, sum_wanted)

{% endraw %} {% raw %}
{% endraw %}

Calibration spécifique au 1er bucket (ajout de gens)

init_calib_zero()

{% raw %}

init_calib_zero[source]

init_calib_zero(var_name, erfs_ind, bucket_erfs, bucket_pote, tracker)

{% endraw %} {% raw %}
{% endraw %}

create_ppl_zero()

{% raw %}

create_ppl_zero[source]

create_ppl_zero(sample_zero, Nb_wanted, Nb_actuel, max_idfoy)

Ajout de foyers de rfr nul. On notera qu'on ajoute uniquement des déclarants principaux, donc ici foyer = gens. NB : attention, on ajoute ici pas mal de monde dans la population (mais ce sera régulé dans l'étape d'inflation_ff).

{% endraw %} {% raw %}
{% endraw %}

Types de calibration de buckets

calib_empty_bucket()

{% raw %}

calib_empty_bucket[source]

calib_empty_bucket(i, tracker)

{% endraw %} {% raw %}
{% endraw %}

calib_bucket_lim()

{% raw %}

calib_bucket_lim[source]

calib_bucket_lim(erfs, i, sample_fake, not_fake_sum, tracker, limit)

{% endraw %} {% raw %}
{% endraw %}

optimization_quantile()

{% raw %}

optimization_quantile[source]

optimization_quantile(erfss, i, tracker, sample_fake, not_fake_sum, error_in, var_name, err_max)

{% endraw %} {% raw %}
{% endraw %}

amelioration_quantile()

{% raw %}

amelioration_quantile[source]

amelioration_quantile(i, tracker, sample_fake_in, not_fake_sum, error_in, changing_var_name)

{% endraw %} {% raw %}
{% endraw %}

Calibration

calibration_quantiles()

{% raw %}

calibration_quantiles[source]

calibration_quantiles(var_name, erfs_ind, Distrib_ERFS, Distrib_POTE, err_max)

{% endraw %} {% raw %}
{% endraw %}

calibration()

{% raw %}

calibration[source]

calibration(erfs_to_cal_ind, var_name, annee_erfs, annee_pote, calib=None)

{% endraw %} {% raw %}
{% endraw %}

[TESTS] Création de fausses données

Exemple minimal

{% raw %}
from leximpact_prepare_data.calib_and_copules import (
    get_calib,
    get_fake_data,
    pandas_to_vaex,
)
{% endraw %} {% raw %}
# CALIB2
calib2 = {
    "lower_bound": 0.0,
    "upper_bound": 1000000000000000,
    "nb_foyer": {"zero": 10, "nonzero": 87},
    "buckets": [
        {
            "seuil_var_inf": 0,
            "seuil_var_supp": 0,
            "nombre_ff_tranche": 5,
            "sum_tranche_var": 0,
            "mean_tranche_var": 0,
            "stdev_tranche_var": 0,
            "nb_above_seuil": 87,
            "sum_var_above_seuil": 0,
            "ratio_nb_above_seuil": 0,
            "mean_var_above_seuil": 0,
        },
        {
            "seuil_var_inf": 0,
            "seuil_var_supp": 1,
            "nombre_ff_tranche": 5,
            "sum_tranche_var": 0,
            "mean_tranche_var": 0,
            "stdev_tranche_var": 0,
            "nb_above_seuil": 87,
            "sum_var_above_seuil": 0,
            "ratio_nb_above_seuil": 0,
            "mean_var_above_seuil": 0,
        },
        {
            "seuil_var_inf": 1,
            "seuil_var_supp": 100,
            "nombre_ff_tranche": 2,
            "sum_tranche_var": 70,
            "mean_tranche_var": 35,
            "stdev_tranche_var": 0,
            "nb_above_seuil": 87,
            "sum_var_above_seuil": 0,
            "ratio_nb_above_seuil": 0,
            "mean_var_above_seuil": 0,
        },
        {
            "seuil_var_inf": 100,
            "seuil_var_supp": 200,
            "nombre_ff_tranche": 6,
            "sum_tranche_var": 1030,  # Quasi egal à la somme = 1049
            "mean_tranche_var": 1030 / 6,
            "stdev_tranche_var": 0,
            "nb_above_seuil": 0,
            "sum_var_above_seuil": 0,
            "ratio_nb_above_seuil": 0,
            "mean_var_above_seuil": 0,
        },
        {
            "seuil_var_inf": 200,
            "seuil_var_supp": 2000,
            "nombre_ff_tranche": 19,
            "sum_tranche_var": 6500,
            "mean_tranche_var": 342,
            "stdev_tranche_var": 0,
            "nb_above_seuil": 87,
            "sum_var_above_seuil": 0,
            "ratio_nb_above_seuil": 0,
            "mean_var_above_seuil": 0,
        },
        {
            "seuil_var_inf": 2000,
            "seuil_var_supp": 10000,
            "nombre_ff_tranche": 6,
            "sum_tranche_var": 8000,
            "mean_tranche_var": 1333,
            "stdev_tranche_var": 0,
            "nb_above_seuil": 87,
            "sum_var_above_seuil": 0,
            "ratio_nb_above_seuil": 0,
            "mean_var_above_seuil": 0,
        },
        {
            "seuil_var_inf": 10000,
            "seuil_var_supp": 25000,
            "nombre_ff_tranche": 3,
            "sum_tranche_var": 24350,
            "mean_tranche_var": 12000,
            "stdev_tranche_var": 0,
            "nb_above_seuil": 87,
            "sum_var_above_seuil": 0,
            "ratio_nb_above_seuil": 0,
            "mean_var_above_seuil": 0,
        },
        {
            "seuil_var_inf": 25000,
            "seuil_var_supp": 50000,
            "nombre_ff_tranche": 2,
            "sum_tranche_var": 90_000,
            "mean_tranche_var": 40000,
            "stdev_tranche_var": 0,
            "nb_above_seuil": 87,
            "sum_var_above_seuil": 0,
            "ratio_nb_above_seuil": 0,
            "mean_var_above_seuil": 0,
        },
    ],
}
# ERFS_FF2
erfs_ff2 = None
erfs_ff2 = pd.DataFrame(
    [
        [1, 1, 0],
        [2, 1, 0],
        [3, 1, 0],  # Bucket de 3 foyers à zéro
        [4, 1, 46],
        [5, 1, 99],
        [6, 1, 90],  # Bucket de 2 foyers > sum_min
        [7, 1, 250],
        [8, 1, 300],
        [9, 1, 1000],  # Bucket de 3 foyers < sum_max
        [10, 1, 21_000],
        [11, 1, 12_000],  # Bucket de 2 foyers à calib
        [12, 1, 100],
        [13, 1, 199],
        [14, 1, 180],
        [15, 1, 185],
        [16, 1, 198],
        [17, 1, 187],  # Bucket de err<err_max
        [18, 1, 26_000],
        [19, 1, 49_000],  # Calib itérative
    ],
    columns=["idfoy", "wprm", "rfr"],
)
# erfs_ff2
{% endraw %} {% raw %}
erfs_ff2["quifoy"] = 0
erfs_ff2["idfam"] = 0
erfs_ff2["idmen"] = 0
erfs_ff2 = enlarge(erfs_ff2, 3)
erfs_ff2test = erfs_ff2.copy()
{% endraw %}

Base réduite

Imports

{% raw %}
erfs_02 = pd.read_hdf(
    config.get("DATA_OUT") + "02_erfs_enlarged_ind" + config.get("YEAR_ERFS") + ".h5"
)
len(erfs_02)  # EN INDIVIDUS !
erfs_02.columns
erfs_02 = erfs_02.fillna(0)
tc.assertEqual(erfs_02.isna().sum().sum(), 0)
erfs_02 = erfs_02.sample(n=100000, random_state=35)  # 40
{% endraw %}

Calcul du RFR

{% raw %}
from openfisca_france import FranceTaxBenefitSystem

TBS = FranceTaxBenefitSystem()
{% endraw %} {% raw %}
my_simu, dico = create_simulation(data=erfs_02, tbs=TBS, period=annee_de_calcul)
{% endraw %} {% raw %}
var_list = ["rfr"]
cols_declarant_principal = ["rfr"]
print(erfs_02.columns)
erfs_03 = compute_var_in_ff(
    my_simu, annee_de_calcul, erfs_02, var_list, cols_declarant_principal
)
print("Nombre d'individus : ", len(erfs_03))
erfs_03
{% endraw %}

Informations sur le RFR calculé

{% raw %}
print("Max : ", erfs_03["rfr"].max())
print("Min : ", erfs_03["rfr"].min())
print("Somme pondérée : ", (erfs_03["rfr"] * erfs_03["wprm"]).sum())
{% endraw %}

Création d'un fichier calib fake

{% raw %}
erfs_03["wprm"] = 1  # Pour que chaque foyer ait un poids de 1
erfs_03["revkire"] = erfs_03["rfr"]
erfs_03 = erfs_03.sort_values(by="revkire")
# erfs_03.columns

# Génération de la fake calib
vaex_df = pandas_to_vaex(erfs_03)
calib = get_calib(vaex_df, "revkire", 100)

erfs_03 = erfs_03.drop(["revkire"], axis=1)
erfs_03.columns

# calib["buckets"][2]
{% endraw %} {% raw %}
# Append three hand-written buckets at the top of the fake calibration.
# Each row is (lower threshold, upper threshold, household count, mean value);
# the bucket sum is count * mean and every remaining statistic is set to 0.
calib["buckets"].extend(
    {
        "seuil_var_inf": lower,
        "seuil_var_supp": upper,
        "nombre_ff_tranche": count,
        "sum_tranche_var": count * mean,
        "mean_tranche_var": mean,
        "stdev_tranche_var": 0,
        "nb_above_seuil": 0,
        "sum_var_above_seuil": 0,
        "ratio_nb_above_seuil": 0,
        "mean_var_above_seuil": 0,
    }
    for lower, upper, count, mean in (
        (953_614.5, 3_250_000, 21, 1_500_000),  # Includes the max of the ERFS base
        (3_250_000, 4_100_000, 6, 4_000_000),
        (4_100_000, 5_000_000, 2, 4_200_000),
    )
)
{% endraw %} {% raw %}
# erfs_03.tail()
print("Sum avant :", erfs_03["rfr"].sum())


def randomizing(row):
    """Multiply a value by a random factor: one ``np.random.rand()`` draw plus 0.5."""
    return row * (np.random.rand() + 0.5)  # *5


# Perturb every rfr value independently, then show the new total
erfs_03["rfr"] = erfs_03["rfr"].apply(randomizing)
print("Sum apres :", erfs_03["rfr"].sum())
# erfs_03.tail()
{% endraw %}

Conversion de la base ERFS-FPR en foyers fiscaux et enlargement

{% raw %}
erfs_03 = enlarge(erfs_03, 3)
erfs_03.columns
erfs_03.head()
{% endraw %} {% raw %}
erfs_03_ff = individus_to_foyers_fiscaux(erfs_03)
tc.assertEqual(erfs_03["idfoy"].nunique(), erfs_03_ff["idfoy"].nunique())
# erfs_03_ff.head()

print("Nombre de foyers : ", len(erfs_03_ff))
print("Somme des poids : ", erfs_03_ff["wprm"].sum())
{% endraw %} {% raw %}
# To limit computation, keep only the variables that are needed
erfs_03_ff = erfs_03_ff[["idfoy", "wprm", "fake_id", "rfr"]]

# Independent snapshots reused by the other tests
erfs_test = erfs_03_ff.copy(deep=True)
erfs_ind = erfs_03.copy(deep=True)
# dict.copy() is shallow: the nested "buckets" list would remain shared with
# `calib`, so any in-place change to one would leak into the other.
# A deep copy keeps calib_test fully isolated.
calib_test = copy.deepcopy(calib)
{% endraw %} {% raw %}
print(
    "Gens de rfr nul dans la base ERFS : ",
    erfs_03_ff[erfs_03_ff["rfr"] < calib["buckets"][1]["seuil_var_inf"]]["wprm"].sum(),
)
{% endraw %}

[TESTS] Vérification des fonctions

{% raw %}
var_name = "rfr"

# erfs = erfs_ff2
# calib = calib2
erfs_ff2.head()
calib2["buckets"][5]
len(calib2["buckets"])
{% endraw %} {% raw %}
# calib = calib_test
erfs_test.head()
calib_test["buckets"][8]
{% endraw %}

Quantiles

test_get_minimal_frontiers()

{% raw %}
def test_get_minimal_frontiers(calib, erfs, var_name):
    """Check the invariants of ``get_minimal_frontiers`` and return its outputs.

    Args:
        calib: calibration dict holding a "buckets" list.
        erfs: ERFS base passed through to ``get_minimal_frontiers``.
        var_name: name of the variable being calibrated.

    Returns:
        The ``(calib, frontiers)`` pair produced by ``get_minimal_frontiers``.
    """
    reduced_calib, frontiers = get_minimal_frontiers(calib, erfs, var_name)
    nb_buckets_out = len(reduced_calib["buckets"])

    # The reduced calibration has at most as many buckets as the input
    assert nb_buckets_out <= len(calib["buckets"])
    # N buckets are delimited by N + 1 frontiers
    assert len(frontiers) - 1 == nb_buckets_out
    # No frontier appears twice
    assert len(frontiers) == len(set(frontiers))

    return reduced_calib, frontiers


# Données minimales
calibT1, frontieres_varT1 = test_get_minimal_frontiers(calib2, erfs_ff2, var_name)
assert frontieres_varT1 == [0, 1.0, 100, 200, 2000, 10000, 25000, 50001]
assert calibT1["buckets"][0]["nombre_ff_tranche"] == 10  # Bonne fusion des buckets
# Base réduite
calibT2, frontieres_varT = test_get_minimal_frontiers(calib_test, erfs_test, var_name)
assert len(calibT2["buckets"]) == 105
{% endraw %} {% raw %}
def test_get_minimal_frontiers2(base):
    """Check that ``get_minimal_frontiers`` keeps the total of ``nombre_ff_tranche``.

    A hard-coded calibration containing many buckets with identical thresholds
    is passed to ``get_minimal_frontiers``; the sum of ``nombre_ff_tranche``
    over all buckets must be the same before and after the call.

    Args:
        base: ERFS base passed through to ``get_minimal_frontiers``.
    """
    bucket_keys = (
        "seuil_var_inf",
        "seuil_var_supp",
        "nombre_ff_tranche",
        "sum_tranche_var",
        "mean_tranche_var",
        "nb_above_seuil",
        "sum_var_above_seuil",
        "ratio_nb_above_seuil",
        "mean_var_above_seuil",
    )
    # One row per bucket, values listed in the same order as bucket_keys
    bucket_rows = (
        (0, 0, 23433471, 0, 0, 15831225, 89542579670.09967, 0.40319234866863607, 5656.073972172063),
        (0.7, 1.0, 158312, 158290.1, 0.9998616655717824, 15672913, 89542421379.99966, 0.3991604315489925, 5713.195841768513),
        (1.0, 1.0, 158312, 158312.0, 1.0, 15514601, 89542263067.99966, 0.39512851442934893, 5771.483460515657),
        (1.0, 1.0, 158312, 158312.0, 1.0, 15356289, 89542104755.99966, 0.39109659730970536, 5830.9728838783685),
        (1.0, 1.0, 158313, 158313.0, 1.0, 15197976, 89541946442.99966, 0.3870646547218906, 5891.70205578688),
        (1.0, 1.0, 158312, 158312.0, 1.0, 15039664, 89541788130.99966, 0.38303273760224704, 5953.709346897621),
        (1.0, 1.0, 158312, 158312.0, 1.0, 14881352, 89541629818.99966, 0.37900082048260353, 6017.035939946832),
        (1.0, 1.0, 158312, 158312.0, 1.0, 14723040, 89541471506.99966, 0.37496890336295996, 6081.724392992185),
        (1.0, 1.0, 158313, 158313.0, 1.0, 14564727, 89541313193.99966, 0.3709369607751452, 6147.819536473266),
        (1.0, 1.0, 158312, 158312.0, 1.0, 14406415, 89541154881.99966, 0.36690504365550164, 6215.366896066764),
        (1.0, 1.0, 158312, 158312.0, 1.0, 14248103, 89540996569.99966, 0.36287312653585807, 6284.415305672598),
        (1.0, 2.0, 158312, 311850.6, 1.9698481479609882, 14089791, 89540684719.39966, 0.3588412094162145, 6355.004465247189),
        (2.0, 2.0, 158313, 316626.0, 2.0, 13931478, 89540368093.39966, 0.35480926682839975, 6427.198039820301),
        (2.0, 2.0, 158312, 316624.0, 2.0, 13773166, 89540051469.39966, 0.3507773497087562, 6501.050772886906),
        (2.0, 2.0, 158312, 316624.0, 2.0, 13614854, 89539734845.39966, 0.3467454325891126, 6576.621008598378),
        (2.0, 2.0, 158312, 316624.0, 2.0, 13456542, 89539418221.39966, 0.34271351546946904, 6653.969364596021),
        (2.0, 3.0, 158313, 467797.4, 2.954889364739472, 13298229, 89538950423.99966, 0.3386815728816543, 6733.14848345593),
        (3.0, 3.0, 158312, 474936.0, 3.0, 13139917, 89538475487.99966, 0.3346496557620107, 6814.234480172109),
        (3.0, 3.0, 158312, 474936.0, 3.0, 12981605, 89538000551.99966, 0.33061773864236715, 6897.2981809259845),
        (3.0, 4.0, 158312, 594951.2, 3.7580928798827626, 12823293, 89537405600.79965, 0.32658582152272364, 6982.403474739262),
        (4.0, 4.0, 158313, 633252.0, 4.0, 12664980, 89536772348.79967, 0.32255387893490883, 7069.633931423474),
        (4.0, 5.0, 158312, 731992.6000000001, 4.6237341452322, 12506668, 89536040356.19966, 0.3185219618152653, 7159.064297237254),
        (5.0, 6.0, 158312, 811433.2999999995, 5.125532492799026, 12348356, 89535228922.89967, 0.31449004469562175, 7250.781312338231),
    )

    calib3 = {
        "lower_bound": 0,
        "upper_bound": 1000000000000000,
        "nb_foyer": {"zero": 23433471, "nonzero": 15831225},
        "buckets": [dict(zip(bucket_keys, row)) for row in bucket_rows],
    }

    # Total measured before the call, so it reflects the untouched input
    tot_av = sum(bucket["nombre_ff_tranche"] for bucket in calib3["buckets"])

    calib3p, front3 = get_minimal_frontiers(calib3, base, "rfr")

    tot_ap = sum(bucket["nombre_ff_tranche"] for bucket in calib3p["buckets"])
    print(front3)
    print(calib3p)

    tc.assertEqual(tot_av, tot_ap)


# Base réduite
test_get_minimal_frontiers2(erfs_test)
{% endraw %} {% raw %}
calib_test["buckets"][1]
{% endraw %}

test_nb_zero

{% raw %}
def test_nb_zero(erfs, var_name, calib):
    """Compare the two counts returned by ``nb_zero`` with hard-coded reference values.

    Args:
        erfs: ERFS base passed through to ``nb_zero``.
        var_name: name of the variable being calibrated.
        calib: calibration dict passed through to ``nb_zero``.
    """
    zeros_erfs, zeros_pote = nb_zero(erfs, var_name, calib)
    # The ERFS count only has a lower bound; the POTE count must match exactly
    assert 41127 <= zeros_erfs
    assert 61817 == zeros_pote


test_nb_zero(erfs_test, var_name, calib_test)
{% endraw %}

test_distrib_to_quantiles

{% raw %}
def test_distrib_to_quantiles(erfs_ff, var_name, calib):
    """Run ``distrib_to_quantiles`` and return its three outputs as a tuple.

    Args:
        erfs_ff: ERFS base passed through to ``distrib_to_quantiles``.
        var_name: name of the variable being calibrated.
        calib: calibration dict passed through to ``distrib_to_quantiles``.

    Returns:
        The three values produced by ``distrib_to_quantiles``, in order.
    """
    outputs = distrib_to_quantiles(erfs_ff, var_name, calib)
    # Unpacking enforces that exactly three values come back
    distrib_erfs, distrib_pote, calib_out = outputs
    return distrib_erfs, distrib_pote, calib_out


# Données minimales
Distrib_ERFST, Distrib_POTET, calibT = test_distrib_to_quantiles(
    erfs_ff2, var_name, calib2
)
assert Distrib_POTET.df.iloc[0]["nb_ff"] == 10
assert Distrib_POTET.df.iloc[4]["sum"] == 8000
assert Distrib_ERFST.df.iloc[4]["sum"] == 0
assert Distrib_ERFST.df.iloc[5]["nb_ff"] == 2
assert Distrib_ERFST.df.iloc[6]["sum"] == 75000

# Base réduite (On ne peut pas mettre de valeurs car il y a une création random)
Distrib_ERFST, Distrib_POTET, calibT = test_distrib_to_quantiles(
    erfs_test, var_name, calib_test
)
sampleT = Distrib_ERFST.bucket_list["6"].sample
assert Distrib_ERFST.df.iloc[6]["sum"] == (sampleT["wprm"] * sampleT[var_name]).sum()
# assert Distrib_ERFST.df.iloc[104]["sum"] == 0  # Bucket vide
assert (
    Distrib_ERFST.bucket_list["97"].seuil_max == Distrib_ERFST.df.iloc[97]["seuil_max"]
)
{% endraw %}

Création de quantiles pour les tests de calibration

Exemple minimal

{% raw %}
Distrib_ERFS2, Distrib_POTE2, calib = distrib_to_quantiles(erfs_ff2, "rfr", calib2)
{% endraw %} {% raw %}
tracker2 = Tracker(frontieres_varT1)
{% endraw %}

Base réduite

{% raw %}
trackerT = Tracker(frontieres_varT)
{% endraw %} {% raw %}
Distrib_ERFST, Distrib_POTET, calib = distrib_to_quantiles(erfs_test, "rfr", calib_test)

fig3, error_avant_calib, final_error = compare_distributions(
    Distrib_ERFST.df,
    Distrib_POTET.df,
    "rfr",
    annee_erfs,
    annee_pote,
    log=False,
    title_suffix="avant_calib",
    df_cal=[],
)
{% endraw %}

test_seuil_chomage()

{% raw %}
 
{% endraw %} {% raw %}
def test_seuil_chomage(calib_in):
    """Check ``seuil_chomage`` against the bucket count of its input.

    Two properties are verified: the input calibration keeps the same number
    of buckets after the call, and the returned calibration does not hold more
    buckets than the input.

    Args:
        calib_in: calibration dict holding a "buckets" list.

    Returns:
        The calibration dict returned by ``seuil_chomage``.
    """
    nb_buckets_in = len(calib_in["buckets"])
    calib_out = seuil_chomage(calib_in)
    # The input must not have been modified in place
    tc.assertEqual(nb_buckets_in, len(calib_in["buckets"]))
    # Compare bucket counts of this call's input and output (not the number of
    # top-level keys of an unrelated global dict)
    tc.assertGreaterEqual(nb_buckets_in, len(calib_out["buckets"]))
    return calib_out


calib_out = test_seuil_chomage(calib_test)
# calib_out
tc.assertEqual(len(calib_out["buckets"]), 100)
tc.assertEqual(calib_out["buckets"][-1]["seuil_var_supp"], 365 * 241.22)
{% endraw %}

test_save_made_up_people()

{% raw %}
def test_save_made_up_people(erfs_ind):
    """Call ``save_made_up_people`` using the last four rows of the base as new people.

    Args:
        erfs_ind: base whose last four rows are reused as the "new people" input.

    Returns:
        Whatever ``save_made_up_people`` returns.
    """
    return save_made_up_people(erfs_ind, erfs_ind[-4:])


erfs_test = erfs_test.sort_values(by="idfoy")
erfs_ind = erfs_ind.sort_values(by="idfoy")
erfs_ind.tail()
erfs_smup = test_save_made_up_people(erfs_ind)
erfs_smup[-10:]
{% endraw %}

test_ajout_gens_en_haut()

{% raw %}
def test_ajout_gens_en_haut(
    erfs, erfs_ind, var_name, Distrib_POTE, Distrib_ERFS, calib
):
    """Run ``ajout_gens_en_haut`` and check the enlarged base.

    Checks that the base did not shrink, that ``idfoy`` values are unique, and
    that, once quantiles are recomputed, no ERFS bucket has ``nb_ff == 0``.
    A comparison figure is then built with ``compare_distributions``.

    Args:
        erfs: base passed as first argument to ``ajout_gens_en_haut``.
        erfs_ind: base passed as second argument to ``ajout_gens_en_haut``.
        var_name: name of the variable being calibrated.
        Distrib_POTE: POTE quantile object passed to ``ajout_gens_en_haut``.
        Distrib_ERFS: ERFS quantile object passed to ``ajout_gens_en_haut``.
        calib: calibration dict used to recompute the quantiles.

    Returns:
        Tuple ``(enlarged base, figure)``.
    """
    enlarged, _ = ajout_gens_en_haut(
        erfs, erfs_ind, var_name, Distrib_POTE, Distrib_ERFS
    )
    nb_added = len(enlarged) - len(erfs)
    print("On a ajouté ", nb_added, " foyers dans la base")
    assert nb_added >= 0
    assert enlarged["idfoy"].nunique() == len(enlarged)

    # Recompute the quantiles on the enlarged base
    quantiles_erfs, quantiles_pote, _ = distrib_to_quantiles(
        enlarged, var_name, calib
    )
    empty_buckets = quantiles_erfs.df[quantiles_erfs.df["nb_ff"] == 0]
    assert empty_buckets.empty is True

    # Plot the distributions to observe the added households
    figure, _, _ = compare_distributions(
        quantiles_erfs.df,
        quantiles_pote.df,
        var_name,
        annee_erfs,
        annee_pote,
        log=False,
        title_suffix="avant_calibration",
        df_cal=[],
    )

    return enlarged, figure


# Données réduites
# Distrib_POTET.df["seuil_inf"]
# Distrib_ERFST.df["seuil_inf"]
_ = test_ajout_gens_en_haut(
    erfs_test, erfs_ind, var_name, Distrib_POTET, Distrib_ERFST, calib_test
)
{% endraw %}

Calibration

test_merge_and_replace

{% raw %}
def test_merge_and_replace():
    """Check ``merge_and_replace`` on a small hand-written base and sample.

    The sample overlaps the base on some ``idfoy`` values, carries different
    ``var_cal`` and ``wprm`` values, and contains one ``idfoy`` (78) absent
    from the base. Only ``var_cal`` is listed as the column to replace.
    """
    # Data
    base = pd.DataFrame(
        [
            [11, 0, 0],
            [12, 1, 0],
            [13, 1, 0],
            [4, 1, 46],
            [5, 1, 99],
            [6, 1, 90],
            [7, 1, 250],
            [8, 1, 300],
            [9, 1, 1000],
        ],
        columns=["idfoy", "wprm", "var_cal"],
    )

    sample = pd.DataFrame(
        [
            [11, 1, 0],
            [12, 1, 101],
            [4, 1, 46],
            [5, 1, 101],
            [7, 0, 250],
            [8, 1, 101],
            [78, 1, 101],
        ],
        columns=["idfoy", "wprm", "var_cal"],
    )

    base = merge_and_replace(base, sample, ["var_cal"])

    def value_of(idfoy, column):
        # Series.item() returns the single matching value and raises if the
        # household is missing or duplicated; calling float() directly on a
        # one-element Series is deprecated in recent pandas.
        return base.loc[base["idfoy"] == idfoy, column].item()

    assert value_of(12, "var_cal") == 101  # The base is updated
    assert value_of(5, "var_cal") == 101  # The base is updated
    assert value_of(13, "var_cal") == 0  # Rows outside the sample do not change
    assert value_of(11, "wprm") == 0  # Only the column of interest is touched
    assert base[base["idfoy"] == 78].empty  # New households are not added

test_merge_and_replace()
{% endraw %}

test_calib_empty_bucket()

{% raw %}
def test_calib_empty_bucket():
    """Check what ``calib_empty_bucket`` records in the tracker for bucket 2.

    The tracker row for that bucket must have ``final_error`` equal to 1 and
    ``error_type`` equal to "empty".
    """
    bucket_index = 2
    updated = calib_empty_bucket(bucket_index, Tracker([1, 3, 5, 14]))
    assert updated.df["final_error"].iloc[bucket_index] == 1
    assert updated.df["error_type"][bucket_index] == "empty"


test_calib_empty_bucket()
{% endraw %}

test_compute_min_max_calib

{% raw %}
def test_compute_min_max_calib(var_name, bucket):
    """Check the bounds returned by ``compute_min_max_calib`` for one bucket.

    The expected totals are rebuilt from ``bucket.sample``: the weighted sum
    (``var_name`` times ``wprm``) of the rows with ``fake_id == 0`` is kept
    fixed, and the rows with ``fake_id == 1`` contribute their weight times
    the returned minimum (resp. maximum) value.

    Returns the four values produced by ``compute_min_max_calib`` as a tuple
    ``(min_possible, max_possible, var_min, var_max)``.
    """
    # Fixed part of the total: rows flagged fake_id == 0.
    real_rows = bucket.sample[bucket.sample["fake_id"] == 0]
    fixed_total = (real_rows[var_name] * real_rows["wprm"]).sum()
    # Snapshot of the rows flagged fake_id == 1, taken before the call.
    fake_rows = bucket.sample[bucket.sample["fake_id"] == 1].copy()

    lowest_total, highest_total, lowest_value, highest_value = compute_min_max_calib(
        var_name, bucket
    )

    fake_weights = fake_rows["wprm"]
    assert lowest_total == (lowest_value * fake_weights).sum() + fixed_total
    assert highest_total == (highest_value * fake_weights).sum() + fixed_total

    return lowest_total, highest_total, lowest_value, highest_value


# Run the check on buckets "3", "51" and "89" of Distrib_ERFST; only the
# assertions inside the test matter, so the return values are discarded.
_ = test_compute_min_max_calib(var_name, Distrib_ERFST.bucket_list["3"])
_ = test_compute_min_max_calib(var_name, Distrib_ERFST.bucket_list["51"])
_ = test_compute_min_max_calib(var_name, Distrib_ERFST.bucket_list["89"])
# Minimal data
min_possibleT, max_possibleT, var_minT, var_maxT = test_compute_min_max_calib(
    var_name, Distrib_ERFS2.bucket_list["5"]
)
# Expected value range for bucket "5": [10_000, 25_000 - 1].
assert var_minT == 10_000
assert var_maxT == 25_000 - 1
# The possible totals must stay strictly inside nb_ff times those limits.
assert min_possibleT > 10_000 * Distrib_ERFS2.bucket_list["5"].nb_ff
assert max_possibleT < 25_000 * Distrib_ERFS2.bucket_list["5"].nb_ff
{% endraw %} {% raw %}
# Presumably aggregates the individual-level table to one row per
# "foyer fiscal", judging by the helper's name — TODO confirm.
erfs_cal = individus_to_foyers_fiscaux(erfs_ind)
{% endraw %}

test_init_calib_zero()

{% raw %}
def test_init_calib_zero(var_name, erfs_ind, bucket_erfs, Distrib_POTE, tracker):
    """Check that ``init_calib_zero`` puts the expected weight at zero.

    ``var_cal`` is first set to 0.1 on ``erfs_ind`` (the caller's frame is
    modified in place), then ``init_calib_zero`` is run against bucket "0" of
    ``Distrib_POTE``. After conversion with ``individus_to_foyers_fiscaux``,
    the rounded sum of ``wprm`` over rows where ``var_cal == 0`` must equal
    ``nb_ff`` of that bucket.
    """
    zero_key = str(0)

    # Non-zero starting value, so zeros found afterwards come from the call.
    erfs_ind["var_cal"] = 0.1

    tracker, erfs_cal_ind = init_calib_zero(
        var_name, erfs_ind, bucket_erfs, Distrib_POTE.bucket_list[zero_key], tracker
    )
    calibrated_ff = individus_to_foyers_fiscaux(erfs_cal_ind)
    expected_weight = Distrib_POTE.bucket_list[zero_key].nb_ff

    # Weighted count of rows sitting exactly at zero after calibration.
    at_zero = calibrated_ff[calibrated_ff["var_cal"] == 0]
    observed_weight = round(at_zero["wprm"].sum(), 0)
    print("Nb de gens à zéro post calib : ", observed_weight, "\n")

    tc.assertEqual(observed_weight, expected_weight)


# First run: the "2"-suffixed fixtures (erfs_ff2test, Distrib_ERFS2,
# Distrib_POTE2, tracker2).
test_init_calib_zero(
    var_name,
    erfs_ff2test,
    Distrib_ERFS2.bucket_list["0"],
    Distrib_POTE2,
    tracker2,
)
# Second run: the "T"-suffixed fixtures (erfs_ind, Distrib_ERFST,
# Distrib_POTET, trackerT).
test_init_calib_zero(
    var_name,
    erfs_ind,
    Distrib_ERFST.bucket_list["0"],
    Distrib_POTET,
    trackerT,
)
{% endraw %} {% raw %}
# Inspect bucket "0" of Distrib_POTE2: its nb_ff, seuil_inf and seuil_max
# attributes.
print(Distrib_POTE2.bucket_list["0"].nb_ff)
print(Distrib_POTE2.bucket_list["0"].seuil_inf)
print(Distrib_POTE2.bucket_list["0"].seuil_max)
{% endraw %}

test_amelioration_quantile()

{% raw %}
def test_amelioration_quantile(i, tracker, Distrib_ERFS, Distrib_POTE):
    """Check that ``amelioration_quantile`` reduces the error on bucket ``i``.

    Args:
        i: bucket number; ``str(i)`` is the key used in both ``bucket_list``
            mappings and in ``tracker.df``.
        tracker: object whose ``df`` has a ``wanted`` column; the row
            ``str(i)`` is set to the POTE target sum before the call.
        Distrib_ERFS: distribution providing the bucket sample under test.
        Distrib_POTE: distribution providing the target ``sum_``.

    The test passes when the absolute error returned by
    ``amelioration_quantile`` is strictly smaller than the initial relative
    error between the weighted ``rfr`` sum of the bucket and the target.
    """
    key = str(i)
    changing_var_name = "rfr"

    bucket = Distrib_ERFS.bucket_list[key]
    target_sum = Distrib_POTE.bucket_list[key].sum_

    # Explicit copy: assigning through ``.loc`` on a filtered frame triggers
    # SettingWithCopyWarning and must never leak back into ``bucket.sample``.
    sample_fake_in = bucket.sample[bucket.sample["fake_id"] == 1].copy()
    sample_fake_in.loc[:, "var_cal"] = 0.1

    # Single ``.loc`` assignment instead of ``df["wanted"][key] = ...``:
    # chained assignment does not update ``tracker.df`` under pandas
    # Copy-on-Write.
    tracker.df.loc[key, "wanted"] = target_sum

    # Weighted sum carried by the rows flagged fake_id == 0.
    real_rows = bucket.sample[bucket.sample["fake_id"] == 0]
    not_fake_sum = (real_rows["wprm"] * real_rows[changing_var_name]).sum()

    # Relative error of the whole bucket before any improvement step.
    bucket_sum = (bucket.sample[changing_var_name] * bucket.sample["wprm"]).sum()
    error_in = (bucket_sum - target_sum) / target_sum

    # Test
    print("Error before", error_in)
    # Only the third returned value (the error) is needed here.
    _, _, error, _ = amelioration_quantile(
        i, tracker, sample_fake_in, not_fake_sum, error_in, changing_var_name
    )
    print("Final error", error)

    assert abs(error) < abs(error_in)


# Bucket 2 with the "2"-suffixed fixtures, then bucket 72 with the
# "T"-suffixed fixtures.
test_amelioration_quantile(2, tracker2, Distrib_ERFS2, Distrib_POTE2)
test_amelioration_quantile(72, trackerT, Distrib_ERFST, Distrib_POTET)
{% endraw %}

test_optimization_quantile()

{% raw %}
def test_optimization_quantile(erfss, i, tracker, var_name, Distrib_ERFS, Distrib_POTE):
    """Check that ``optimization_quantile`` reduces the error on bucket ``i``.

    Args:
        erfss: table forwarded as-is to ``optimization_quantile``.
        i: bucket number; ``str(i)`` is the key used in both ``bucket_list``
            mappings and in ``tracker.df``.
        tracker: object whose ``df`` has a ``wanted`` column; the row
            ``str(i)`` is set to the POTE target sum before the call.
        var_name: accepted for signature compatibility; the test itself
            always works on the ``rfr`` column.
        Distrib_ERFS: distribution providing the bucket sample under test.
        Distrib_POTE: distribution providing the target ``sum_``.

    The final relative error is recomputed from the returned fake sample
    (``wprm`` times ``var_cal``) plus the untouched weighted sum of the
    ``fake_id == 0`` rows, and must be strictly smaller in absolute value
    than the initial error.
    """
    err_max = 0.01
    key = str(i)
    changing_var_name = "rfr"

    bucket = Distrib_ERFS.bucket_list[key]
    target_sum = Distrib_POTE.bucket_list[key].sum_

    # Explicit copy: assigning through ``.loc`` on a filtered frame triggers
    # SettingWithCopyWarning and must never leak back into ``bucket.sample``.
    sample_fake = bucket.sample[bucket.sample["fake_id"] == 1].copy()
    sample_fake.loc[:, "var_cal"] = 0.1

    # Single ``.loc`` assignment instead of ``df["wanted"][key] = ...``:
    # chained assignment does not update ``tracker.df`` under pandas
    # Copy-on-Write.
    tracker.df.loc[key, "wanted"] = target_sum

    # Weighted sum carried by the rows flagged fake_id == 0.
    real_rows = bucket.sample[bucket.sample["fake_id"] == 0]
    not_fake_sum = (real_rows["wprm"] * real_rows[changing_var_name]).sum()

    # Relative error of the whole bucket before any calibration.
    bucket_sum = (bucket.sample[changing_var_name] * bucket.sample["wprm"]).sum()
    error_in = (bucket_sum - target_sum) / target_sum
    print("Erreur avant toute calibration", error_in)

    erfss, tracker, sample_fake = optimization_quantile(
        erfss,
        i,
        tracker,
        sample_fake,
        not_fake_sum,
        error_in,
        changing_var_name,
        err_max,
    )

    # Re-read the target from the tracker returned by the optimisation.
    wanted = tracker.df.loc[key, "wanted"]
    calibrated_fake_sum = (sample_fake["wprm"] * sample_fake["var_cal"]).sum()
    error = (not_fake_sum + calibrated_fake_sum - wanted) / wanted
    print("final opt error", error)
    assert abs(error) < abs(error_in)


# Bucket 2 with the "2"-suffixed fixtures, then bucket 78 with the
# "T"-suffixed fixtures.
test_optimization_quantile(erfs_ff2, 2, tracker2, "rfr", Distrib_ERFS2, Distrib_POTE2)
test_optimization_quantile(erfs_test, 78, trackerT, "rfr", Distrib_ERFST, Distrib_POTET)
{% endraw %}

[ESSAIS] Calibration d'un jeu de données

Calibration quantiles

{% raw %}
# Value passed as the last argument (err_max) of calibration_quantiles.
err_max = 0.01
# NOTE(review): plain alias, not a copy — erfs_02_ind and erfs_ff2test are
# the same object.
erfs_02_ind = erfs_ff2test
# Rebinds the global tracker2 with the tracker returned by the calibration.
tracker2, erfs_cal_ff2, erfs_cal_ind2 = calibration_quantiles(
    var_name, erfs_02_ind, Distrib_ERFS2, Distrib_POTE2, err_max
)
# Last expression of the cell: shows the first rows of the result.
erfs_cal_ff2.head()
{% endraw %} {% raw %}
# Rebinds the global var_name used by the following cells.
var_name = "rfr"
err_max = 0.01

# The third returned value is not needed here and is discarded.
trackerT, erfs_cal_test, _ = calibration_quantiles(
    var_name, erfs_ind, Distrib_ERFST, Distrib_POTET, err_max
)
{% endraw %} {% raw %}
# Build the quantile distribution of the calibrated table.
# NOTE(review): this also rebinds the globals Distrib_POTET and calib.
Distrib_CALT, Distrib_POTET, calib = distrib_to_quantiles(
    erfs_cal_test, "rfr", calib_test
)

# Compare the ERFS and POTE distributions, with the calibrated distribution
# supplied through df_cal.
fig4, error_post_calib, final_error = compare_distributions(
    Distrib_ERFST.df,
    Distrib_POTET.df,
    "rfr",
    annee_erfs,
    annee_pote,
    log=False,
    title_suffix="_N_3",
    df_cal=Distrib_CALT.df,
)
# Last expression of the cell: shows the calibrated distribution table.
Distrib_CALT.df
{% endraw %}

Calibration complète

{% raw %}
# Trial run of the full calibration.
# NOTE: rebinds the global var_name to "chomage_brut" for the cells below.
var_name = "chomage_brut"
erfs_cal_ff, erfs_cal_ind, Distribs, fig_var_cal = calibration(
    erfs_ind, var_name, annee_erfs, annee_pote, calib=calib_test
)
{% endraw %} {% raw %}
# NOTE(review): a bare expression that is not the last statement of a cell
# is normally not rendered by the notebook; the display relies on
# plt.show() below — confirm this is intended.
fig_var_cal
plt.show()
{% endraw %} {% raw %}
# NOTE(review): only the last expression of a cell is normally displayed,
# so the first line has no visible effect — confirm this is intended.
erfs_cal_ff.columns
erfs_cal_ind.columns
{% endraw %}