from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"Ajout d’individus dans notre base de données
Comme la calibration s’applique à varier les poids des individus existants dans l’ERFS-FPR, il faut ajouter quasi manuellement les profils manquants (souvent les hauts revenus). Pour cela, on repère les déciles de POTE qui sont inexistants dans la base ERFS-FPR.
Imports
import random
import unittest
from collections import Counter
import pandas as pd
from leximpact_common_python_libraries.config import Configuration
from leximpact_survey_scenario.scenario_tools.helpers_survey_scenario import (
distrib_to_quantiles,
get_quantiles_casd,
individus_to_entity,
pote_comparison,
)
config = Configuration(project_folder="leximpact-prepare-data")
tc = unittest.TestCase()# log.debug(config)
import numpy as np
from leximpact_common_python_libraries.logger import logger as log
from leximpact_prepare_data.helpers_survey_scenario import compare_distributions
from leximpact_prepare_data.leximpact_survey_scenario_tools import (
LeximpactErfsSurveyScenario,
)
# from leximpact_socio_fisca_simu_etat.logger import logger as log!ls {config.get("ERFS")}01_erfs_red_2018.h5
01_erfs_reduced_2018.h5
01_erfs_reduced_ind2018.h5
02_erfs_enlarged_2018.h5
02_erfs_enlarged_2018_dev.h5
02_erfs_enlarged_ind2018.h5
03_erfs_rfr_2018.h5
03_erfs_rfr_cal_ff2018.h5
03_erfs_rfr_cal_ind2018.h5
03_erfs_rfr_ind2018.h5
04_erfs_rfr_2018.h5
04_erfs_var_copules_2018.h5
04_erfs_var_copules_2018_dev.h5
05_erfs_ff2018_inflated_to_2018.h5
05_erfs_ff2018_inflated_to_2019.h5
05_erfs_ind2018_inflated_to_2018.h5
05_erfs_ind2018_inflated_to_2019.h5
06_erfs_ff_2018_aged_2021_dev.h5
07_erfs_copules_ff2018_inflated_to_2018.h5
07_erfs_copules_ff2018_inflated_to_2019.h5
07_erfs_copules_ind2018_inflated_to_2018.h5
07_erfs_copules_ind2018_inflated_to_2019.h5
07_erfs_salaire_to_cal_ind2018_inflated_to_2019.h5
08_erfs_salaire_cal_ind2018_inflated_to_2019.h5
base_2021
dummy_data_final.h5
erfs_final_ff_2018_aged_to_2021_N_3.h5
erfs_final_ind_2018_aged_to_2021_N_3.h5
erfs_flat_2018.h5
erfs_var_copules_2018_dev.h5
input-2022-09-19_branche_update_off.h5
pipeline_2022
sample_pop.csv
sample_pop_ff.csv
sample_pop_ff_csg.csv
sample_pop_ind.csv
sample_pop_ind_csg.csv
sdv.csv
test
# Import des années
annee_erfs = config.get("YEAR_ERFS")
annee_pote = config.get("YEAR_POTE")
annee_de_calcul = config.get("YEAR_COMPUTATION")Fonction d’ajout d’individus
Old code
Ajout de foyers/individus dans la distribution
"""
Ajout de foyers (qui sont égaux à un unique individu) dans la distribution
"""
def ajout_gens(erfs_ind, var_name, Distrib_POTE, Distrib_ERFS):
initial_length = len(erfs_ind)
buckets = Distrib_ERFS.bucket_list
list_missing = []
# On parcourt les buckets
for i in range(len(buckets)):
idx = buckets[str(i)].nb
# Si on a un bucket vide alors que le quantile n'est pas vide, et qu'on n'est pas dans le 1er bucket
if (
(buckets[str(i)].nb_ff == 0)
and (Distrib_POTE.bucket_list[str(i)].nb != 0)
and (idx != 0)
):
list_missing.append(idx)
print("List missing", list_missing) # debug
# Si besoin on crée des individus
if len(list_missing) > 0:
print("On a ", len(list_missing), "buckets vides")
new_ff = create_ppl_HR(
erfs_ind, var_name, list_missing, Distrib_POTE, Distrib_ERFS
)
tc.assertGreater(len(new_ff), 0)
# On sauvegarde cela dans la nouvelle base
erfs_ind_new = save_new_ff_to_ind(erfs_ind, new_ff)
assert initial_length < len(erfs_ind_new)
else:
erfs_ind_new = erfs_ind.copy(deep=True)
return erfs_ind_new'\n Ajout de foyers (qui sont égaux à un unique individu) dans la distribution\n'
Création de nouvelles personnes piochées dans un bucket
"""
Création de nouvelles personnes
On notera qu'on ajoute uniquement des déclarants principaux, donc ici foyer=gens
"""
def create_ppl_HR(erfs_ind, var_name, list_missing, Distrib_POTE, Distrib_ERFS):
# On pioche pour chaque bucket le nombre de personnes manquantes
for id_x in list_missing:
# 0 - Choix de l'échantillon où piocher
erfs_ff = individus_to_entity(erfs_ind, "foyer_fiscal")
echantillon_ou_piocher = get_similar_sample(
id_x, erfs_ff, var_name, list_missing, Distrib_POTE, Distrib_ERFS
)
assert (
echantillon_ou_piocher.empty is False
), "Attention, l'échantillon où piocher est vide"
# 1 - Choix du nombre de foyers que l'on doit ajouter
Nb_wanted = Distrib_POTE.bucket_list[str(id_x)].nb_ff
seuil_inf = Distrib_POTE.bucket_list[str(id_x)].seuil_inf
seuil_max = Distrib_POTE.bucket_list[str(id_x)].seuil_max
# Si c'est trop, on en prend au max 0.5% de la population
if Nb_wanted > 1000:
Nb = int(round(len(erfs_ff) / 200, 0))
# print("inf 1000", Nb)
else:
Nb = int(Nb_wanted)
# print("Else", Nb)
# 2 - Sélection au hasard de gens
print(
"On ajoute ",
Nb,
" personnes dans le bucket ",
id_x,
"entre ",
seuil_inf,
" et ",
seuil_max,
)
# Si on peut, on essaye de ne pas prendre plusieurs fois le meme foyer
if Nb >= len(echantillon_ou_piocher):
new_ff = echantillon_ou_piocher.sample(n=Nb, replace=True).reset_index(
drop=True
)
else:
new_ff = echantillon_ou_piocher.sample(n=Nb, replace=False).reset_index(
drop=True
)
# print("echantillon \n", echantillon_ou_piocher)
# TODO : on fixe le random-state pour plus de stabilité?
# 3 - On calibre
sum_wanted = Distrib_POTE.bucket_list[str(id_x)].sum_
new_ff_cal = calibrate_new_ff(
new_ff, Nb_wanted, var_name, seuil_inf, seuil_max, sum_wanted
)
print("new_ff", len(new_ff_cal)) # debug
#
save_new_ff_to_ind
return new_ff_cal"\n Création de nouvelles personnes\n On notera qu'on ajoute uniquement des déclarants principaux, donc ici foyer=gens\n"
Obtention d’un échantillon où piocher des foyers à dupliquer
def get_similar_sample(
id_, erfs_ff, var_name, list_missing, Distrib_POTE, Distrib_ERFS
):
"""On décide de sélectionner des profils "proches",
donc dans les 2 buckets non vides précédant le bucket vide,
ou les 2 suivants s'il n'y en a pas
"""
not_missing = list(
(
Counter(Distrib_POTE.df.index.values.astype(int)) - Counter(list_missing)
).elements()
)
# We do not account for the first bucket
if not_missing[0] == 0:
not_missing = not_missing[1:]
# Getting last 2 not empty
ids = []
for i in reversed(not_missing):
# Finding 2 buckets below
if (i < id_) and (len(ids) < 2):
ids.append(i)
# If we couldn't find 2 buckets below
if len(ids) < 2:
for i in not_missing:
if (i > id_) and (len(ids) < 2):
ids.append(i)
assert len(ids) == 2
# Selected sample
id_min = min(ids)
id_max = max(ids)
sample = erfs_ff.loc[
(erfs_ff[var_name] >= Distrib_POTE.bucket_list[str(id_min)].seuil_inf)
& (erfs_ff[var_name] < Distrib_POTE.bucket_list[str(id_max)].seuil_max)
].copy(deep=True)
assert sample.empty is False, "Attention, l'échantillon où piocher est vide"
return sampleCalibration des foyers nouvellement créés
def calibrate_new_ff(new_ff, Nb_wanted, var_name, seuil_inf, seuil_max, sum_wanted):
# Calibration des poids
rate = Nb_wanted / (new_ff["wprm"].sum())
new_ff.loc[:, "wprm"] = new_ff["wprm"].apply(lambda row: row * rate)
tc.assertAlmostEqual(new_ff["wprm"].sum(), Nb_wanted)
# Calibration de la variable d'intéret
# 1 - On leur attribue une valeur comprise dans le bucket (+/- 5% pour avoir une marge)
new_ff.loc[:, var_name] = [
random.randint(
round(seuil_inf, 0) + 10, round(seuil_max, 0) - 10
) # round(1.05*seuil_inf), round(0.95*seuil_max))
for ppl in range(len(new_ff))
]
sum_ponderee = (new_ff[var_name] * new_ff["wprm"]).sum()
# 2 - On recalibre
error = 1
while error > 0.001:
coef = sum_wanted / sum_ponderee
# On update les valeurs
new_ff.loc[:, var_name] = new_ff[var_name].apply(
lambda row: max(min((row * coef), seuil_max - 1), seuil_inf + 1)
)
sum_ponderee = (new_ff[var_name] * new_ff["wprm"]).sum()
error = (sum_ponderee - sum_wanted) / sum_wanted
tc.assertAlmostEqual(error, 0, places=0)
return new_ffInjection des nouveaux foyers dans la base individus
""" Mise en forme des foyers créés dans la base en individus """
def save_new_ff_to_ind(erfs_ind, new_ff):
# On cherche les individus qui composent les nouveaux foyers fiscaux
id_list = list(new_ff["foyer_fiscal_id"])
new_people = erfs_ind[erfs_ind["foyer_fiscal_id"].isin(id_list)]
# On leur attribue les nouveaux poids calibrés
for id_foy in id_list:
new_people[new_people["foyer_fiscal_id"] == id_foy]["wprm"] = new_ff[
new_ff["foyer_fiscal_id"] == id_foy
]["wprm"]
# new_people = new_people.copy().reset_index(drop=True)
new_people = new_people.reset_index(drop=True)
# Création des Idfoy, Idfam, Idmen uniques
max_idfoy = (
max(erfs_ind["foyer_fiscal_id"]) + 1
) # S'update à chaque ajout de bucket
dict_ = {}
i = 1
new_people["new_foyer_fiscal_id"] = new_people["foyer_fiscal_id"]
for id_foy in id_list:
dict_[id_foy] = max_idfoy + i
new_people[new_people["foyer_fiscal_id"] == id_foy]["famille_id"] = (
max_idfoy + i
)
new_people[new_people["foyer_fiscal_id"] == id_foy]["menage_id"] = max_idfoy + i
new_people[new_people["foyer_fiscal_id"] == id_foy]["new_foyer_fiscal_id"] = (
max_idfoy + i
)
i += 1
new_people["foyer_fiscal_id"] = new_people["new_foyer_fiscal_id"]
new_people.drop("new_foyer_fiscal_id", axis=1)
# TODO PROPAGER AUX POIDS des autres entites
tc.assertEqual(new_people.isna().sum().sum(), 0)
# On ajoute ces gens dans notre base en individus
# new_ff.drop(['index'], axis=1)
erfs_av = erfs_ind.copy() # Pour le test
erfs_ind = pd.concat([erfs_ind, new_people])
erfs_ind = erfs_ind.reset_index(drop=True)
tc.assertEqual(len(erfs_ind), len(erfs_av) + len(new_people))
return erfs_ind' Mise en forme des foyers créés dans la base en individus '
New code
Ajout d’individus dans les quantiles vides
def add_missing_indiv(scenario, annee_ajout, variable):
# On va chercher le fichier de quantiles
quantiles = get_quantiles_casd(variable)
# On va chercher la base individus
var_list = scenario._leximpact_variables
base_indiv = scenario.create_data_frame_by_entity(
variables=var_list, period=annee_ajout, merge=True
)
# On découpe la base en buckets de notre variable d'intérêt
Distrib_BASE, Distrib_POTE, quantiles = distrib_to_quantiles(
individus_to_entity(base_indiv, "foyer_fiscal"), variable, quantiles
)
# On identifie les buckets vides, on fabrique des individus et on les mets dans la base individus
new_base_ind = ajout_gens(base_indiv, variable, Distrib_POTE, Distrib_BASE)
new_base_ff = individus_to_entity(new_base_ind, "foyer_fiscal")
# Plot des histogrammes de la new base
fig, error_df, final_error = pote_comparison(
individus_to_entity(base_indiv, "foyer_fiscal"),
variable,
title_suffix="",
log=True,
base_ff_cal=new_base_ff,
)
print("Avant", len(base_indiv), "apres", len(new_base_ind))
# On remplace la base par la nouvelle base dans notre scenario
print(
"ATTENTION TRAVAIL EN COURS \n Il faudra encore inserer cette base dans un scenario"
)
return new_base_ind, Distrib_BASE# Creation d'une base de test
variable = "rfr"
annee_ajout = 2019
leximpact_scenario = LeximpactErfsSurveyScenario(final_year=2022)
for year in range(int(annee_erfs), int(annee_de_calcul) + 1):
rfr = np.array(leximpact_scenario.calculate_variable("rfr", period=year))
leximpact_scenario.simulation.set_input("rfr", year, rfr)
# print("max ap", max(rfr))
# Avant
base_indiv = leximpact_scenario.create_data_frame_by_entity(
variables=leximpact_scenario._leximpact_variables, period=annee_ajout, merge=True
)
base_ff = individus_to_entity(base_indiv, "foyer_fiscal")
Distrib_BASE, Distrib_POTE, quantiles = distrib_to_quantiles(
base_ff, variable, get_quantiles_casd(variable)
)
# fig, error_df, final_error = pote_comparison(
# base_ff, variable, title_suffix="", log=True
# )Données du scénario :
{'input_data_table_by_entity_by_period': {2018: {'individu': 'individu_2018', 'menage': 'menage_2018'}, 2019: {'individu': 'individu_2018', 'menage': 'menage_2018'}, 2020: {'individu': 'individu_2018', 'menage': 'menage_2018'}, 2021: {'individu': 'individu_2018', 'menage': 'menage_2018'}, 2022: {'individu': 'individu_2018', 'menage': 'menage_2018'}}, 'survey': 'input'}
On a une base initiale de 112144 individus pour 56935 foyer_fiscal s
On regroupe la base selon ['weight_individus', 'weight_familles', 'weight_foyers', 'foyer_fiscal_id']
On a 56935 foyer_fiscal apres le groupby
Nombre de foyers de rfr à zéro, dans l'ERFS : 3572896.0 et dans POTE : 2789455 soit un écart de : 28.08580887664436 %
[0, 0.5, 326.0, 1145.0, 2058.0, 3000.0, 3801.0, 4591.0, 5307.0, 5691.0, 6294.0, 6865.0, 7383.0, 7879.0, 8364.0, 8829.0, 9278.0, 9703.0, 10103.0, 10496.0, 10881.0, 11230.0, 11589.0, 11919.0, 12263.0, 12609.0, 12959.0, 13388.0, 13774.0, 14140.0, 14422.0, 14700.0, 14967.0, 15234.0, 15499.0, 15781.0, 16062.0, 16347.0, 16634.0, 16920.0, 17209.0, 17504.0, 17804.0, 18103.0, 18409.0, 18726.0, 19048.0, 19374.0, 19701.0, 20041.0, 20393.0, 20757.0, 21132.0, 21514.0, 21909.0, 22323.0, 22794.0, 23280.0, 23874.0, 24580.0, 25122.0, 25628.0, 26155.0, 26696.0, 27250.0, 27819.0, 28406.0, 29012.0, 29635.0, 30283.0, 30954.0, 31648.0, 32366.0, 33105.0, 33872.0, 34669.0, 35505.0, 36337.0, 37258.0, 38222.0, 39230.0, 40298.0, 41430.0, 42618.0, 43892.0, 45255.0, 46727.0, 48312.0, 50031.0, 51916.0, 54000.0, 56320.0, 58942.0, 61967.0, 65530.0, 69841.0, 75272.0, 82475.0, 92857.0, 110143.0, 148696.0, 461119.5, 1917075.5, 8131520.0, 29636994.0, 1000000001]
On étudie la distribution en 105 buckets, avec un min de 0 et un max de 1000000001 € de rfr
Dans POTE, on a 2789455 foyers de rfr == 0
Dans l'ERFS, on a 3572896.0 foyers de rfr == 0
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/leximpact-prepare-data/leximpact_prepare_data/helpers_survey_scenario.py:158: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
sample_entity = sample_individus.groupby(wprm_list, as_index=False).sum()
# Ajout de gens
new_base, Distrib_BASE = add_missing_indiv(leximpact_scenario, annee_ajout, variable)
new_base_ff = individus_to_entity(new_base, "foyer_fiscal")On a une base initiale de 112144 individus pour 56935 foyer_fiscal s
On regroupe la base selon ['weight_individus', 'weight_familles', 'weight_foyers', 'foyer_fiscal_id']
On a 56935 foyer_fiscal apres le groupby
Nombre de foyers de rfr à zéro, dans l'ERFS : 3572896.0 et dans POTE : 2789455 soit un écart de : 28.08580887664436 %
[0, 0.5, 326.0, 1145.0, 2058.0, 3000.0, 3801.0, 4591.0, 5307.0, 5691.0, 6294.0, 6865.0, 7383.0, 7879.0, 8364.0, 8829.0, 9278.0, 9703.0, 10103.0, 10496.0, 10881.0, 11230.0, 11589.0, 11919.0, 12263.0, 12609.0, 12959.0, 13388.0, 13774.0, 14140.0, 14422.0, 14700.0, 14967.0, 15234.0, 15499.0, 15781.0, 16062.0, 16347.0, 16634.0, 16920.0, 17209.0, 17504.0, 17804.0, 18103.0, 18409.0, 18726.0, 19048.0, 19374.0, 19701.0, 20041.0, 20393.0, 20757.0, 21132.0, 21514.0, 21909.0, 22323.0, 22794.0, 23280.0, 23874.0, 24580.0, 25122.0, 25628.0, 26155.0, 26696.0, 27250.0, 27819.0, 28406.0, 29012.0, 29635.0, 30283.0, 30954.0, 31648.0, 32366.0, 33105.0, 33872.0, 34669.0, 35505.0, 36337.0, 37258.0, 38222.0, 39230.0, 40298.0, 41430.0, 42618.0, 43892.0, 45255.0, 46727.0, 48312.0, 50031.0, 51916.0, 54000.0, 56320.0, 58942.0, 61967.0, 65530.0, 69841.0, 75272.0, 82475.0, 92857.0, 110143.0, 148696.0, 461119.5, 1917075.5, 8131520.0, 29636994.0, 1000000001]
On étudie la distribution en 105 buckets, avec un min de 0 et un max de 1000000001 € de rfr
Dans POTE, on a 2789455 foyers de rfr == 0
Dans l'ERFS, on a 3572896.0 foyers de rfr == 0
List missing [103, 104]
On a 2 buckets vides
On a une base initiale de 112144 individus pour 56935 foyer_fiscal s
On regroupe la base selon ['weight_individus', 'weight_familles', 'weight_foyers', 'foyer_fiscal_id']
On a 56935 foyer_fiscal apres le groupby
On ajoute 333 personnes dans le bucket 103 entre 8131520.0 et 29636994.0
new_ff 333
On a une base initiale de 112144 individus pour 56935 foyer_fiscal s
On regroupe la base selon ['weight_individus', 'weight_familles', 'weight_foyers', 'foyer_fiscal_id']
On a 56935 foyer_fiscal apres le groupby
On ajoute 37 personnes dans le bucket 104 entre 29636994.0 et 1000000001
new_ff 37
On a une base initiale de 112305 individus pour 56935 foyer_fiscal s
On regroupe la base selon ['weight_individus', 'weight_familles', 'weight_foyers', 'foyer_fiscal_id']
On a 56935 foyer_fiscal apres le groupby
On a une base initiale de 112144 individus pour 56935 foyer_fiscal s
On regroupe la base selon ['weight_individus', 'weight_familles', 'weight_foyers', 'foyer_fiscal_id']
On a 56935 foyer_fiscal apres le groupby
Index(['weight_individus', 'weight_familles', 'weight_foyers',
'foyer_fiscal_id', 'quifam', 'retraite_brute',
'cotisation_sociale_mode_recouvrement', 'ric',
'traitement_indiciaire_brut', 'quifoy', 'activite', 'quimen',
'heures_remunerees_volume', 'rag', 'retraite_imposable',
'primes_fonction_publique', 'rnc', 'salaire_de_base',
'autonomie_financiere', 'pensions_alimentaires_percues', 'idfam',
'idfoy', 'idmen', 'chomage_imposable', 'chomage_brut',
'effectif_entreprise', 'famille_id', 'menage_id', 'menage_role',
'menage_position', 'rfr', 'f4ba', 'statut_occupation_logement',
'weight_menages', 'wprm'],
dtype='object')
Nombre de foyers de rfr à zéro, dans l'ERFS : 3572896.0 et dans POTE : 2789455 soit un écart de : 28.08580887664436 %
[0, 0.5, 326.0, 1145.0, 2058.0, 3000.0, 3801.0, 4591.0, 5307.0, 5691.0, 6294.0, 6865.0, 7383.0, 7879.0, 8364.0, 8829.0, 9278.0, 9703.0, 10103.0, 10496.0, 10881.0, 11230.0, 11589.0, 11919.0, 12263.0, 12609.0, 12959.0, 13388.0, 13774.0, 14140.0, 14422.0, 14700.0, 14967.0, 15234.0, 15499.0, 15781.0, 16062.0, 16347.0, 16634.0, 16920.0, 17209.0, 17504.0, 17804.0, 18103.0, 18409.0, 18726.0, 19048.0, 19374.0, 19701.0, 20041.0, 20393.0, 20757.0, 21132.0, 21514.0, 21909.0, 22323.0, 22794.0, 23280.0, 23874.0, 24580.0, 25122.0, 25628.0, 26155.0, 26696.0, 27250.0, 27819.0, 28406.0, 29012.0, 29635.0, 30283.0, 30954.0, 31648.0, 32366.0, 33105.0, 33872.0, 34669.0, 35505.0, 36337.0, 37258.0, 38222.0, 39230.0, 40298.0, 41430.0, 42618.0, 43892.0, 45255.0, 46727.0, 48312.0, 50031.0, 51916.0, 54000.0, 56320.0, 58942.0, 61967.0, 65530.0, 69841.0, 75272.0, 82475.0, 92857.0, 110143.0, 148696.0, 461119.5, 1917075.5, 8131520.0, 29636994.0, 1000000001]
On étudie la distribution en 105 buckets, avec un min de 0 et un max de 1000000001 € de rfr
Dans POTE, on a 2789455 foyers de rfr == 0
Dans l'ERFS, on a 3572896.0 foyers de rfr == 0
Somme des poids avant calibration 32640040.0
Somme des poids après calibration 32640040.0
Nombre de foyers de rfr à zéro, dans l'ERFS : 3572896.0 et dans POTE : 2789455 soit un écart de : 28.08580887664436 %
[0, 0.5, 326.0, 1145.0, 2058.0, 3000.0, 3801.0, 4591.0, 5307.0, 5691.0, 6294.0, 6865.0, 7383.0, 7879.0, 8364.0, 8829.0, 9278.0, 9703.0, 10103.0, 10496.0, 10881.0, 11230.0, 11589.0, 11919.0, 12263.0, 12609.0, 12959.0, 13388.0, 13774.0, 14140.0, 14422.0, 14700.0, 14967.0, 15234.0, 15499.0, 15781.0, 16062.0, 16347.0, 16634.0, 16920.0, 17209.0, 17504.0, 17804.0, 18103.0, 18409.0, 18726.0, 19048.0, 19374.0, 19701.0, 20041.0, 20393.0, 20757.0, 21132.0, 21514.0, 21909.0, 22323.0, 22794.0, 23280.0, 23874.0, 24580.0, 25122.0, 25628.0, 26155.0, 26696.0, 27250.0, 27819.0, 28406.0, 29012.0, 29635.0, 30283.0, 30954.0, 31648.0, 32366.0, 33105.0, 33872.0, 34669.0, 35505.0, 36337.0, 37258.0, 38222.0, 39230.0, 40298.0, 41430.0, 42618.0, 43892.0, 45255.0, 46727.0, 48312.0, 50031.0, 51916.0, 54000.0, 56320.0, 58942.0, 61967.0, 65530.0, 69841.0, 75272.0, 82475.0, 92857.0, 110143.0, 148696.0, 461119.5, 1917075.5, 8131520.0, 29636994.0, 1000000001]
On étudie la distribution en 105 buckets, avec un min de 0 et un max de 1000000001 € de rfr
Dans POTE, on a 2789455 foyers de rfr == 0
Dans l'ERFS, on a 3572896.0 foyers de rfr == 0
Erreur moyenne des buckets de rfr : 67.5692043967975 %
Erreur min : 7.452301043677421 erreur max: 725.0742587983459
Total de rfr avant : 1901588400000.0 et après calibration : 1916493900000.0
Avant 112144 apres 112305
ATTENTION TRAVAIL EN COURS
Il faudra encore inserer cette base dans un scenario
On a une base initiale de 112305 individus pour 56935 foyer_fiscal s
On regroupe la base selon ['weight_individus', 'weight_familles', 'weight_foyers', 'foyer_fiscal_id']
On a 56935 foyer_fiscal apres le groupby
/home/jupyter-sasha/leximpact-prepare-data/leximpact_prepare_data/helpers_survey_scenario.py:158: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
sample_entity = sample_individus.groupby(wprm_list, as_index=False).sum()
/home/jupyter-sasha/leximpact-prepare-data/leximpact_prepare_data/helpers_survey_scenario.py:158: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
sample_entity = sample_individus.groupby(wprm_list, as_index=False).sum()
/home/jupyter-sasha/leximpact-prepare-data/leximpact_prepare_data/helpers_survey_scenario.py:158: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
sample_entity = sample_individus.groupby(wprm_list, as_index=False).sum()
/tmp/ipykernel_37717/2208608261.py:13: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
new_people[new_people["foyer_fiscal_id"] == id_foy]["wprm"] = new_ff[
/tmp/ipykernel_37717/2208608261.py:29: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
new_people[new_people["foyer_fiscal_id"] == id_foy]["famille_id"] = (
/tmp/ipykernel_37717/2208608261.py:32: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
new_people[new_people["foyer_fiscal_id"] == id_foy]["menage_id"] = max_idfoy + i
/tmp/ipykernel_37717/2208608261.py:33: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
new_people[new_people["foyer_fiscal_id"] == id_foy]["new_foyer_fiscal_id"] = (
/home/jupyter-sasha/leximpact-prepare-data/leximpact_prepare_data/helpers_survey_scenario.py:158: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
sample_entity = sample_individus.groupby(wprm_list, as_index=False).sum()
/home/jupyter-sasha/leximpact-prepare-data/leximpact_prepare_data/helpers_survey_scenario.py:158: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
sample_entity = sample_individus.groupby(wprm_list, as_index=False).sum()
/home/jupyter-sasha/leximpact-prepare-data/leximpact_prepare_data/helpers_survey_scenario.py:158: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
sample_entity = sample_individus.groupby(wprm_list, as_index=False).sum()

1 / 0ZeroDivisionError: division by zero
def test_add_missing_indiv_rfr():
# Creation d'une base de test
variable = "rfr"
annee_ajout = 2019
leximpact_scenario = LeximpactErfsSurveyScenario(final_year=2022)
for year in range(int(annee_erfs), int(annee_de_calcul) + 1):
rfr = np.array(leximpact_scenario.calculate_variable("rfr", period=year))
leximpact_scenario.simulation.set_input("rfr", year, rfr)
# print("max ap", max(rfr))
# Avant
base_indiv = leximpact_scenario.create_data_frame_by_entity(
variables=leximpact_scenario._leximpact_variables,
period=annee_ajout,
merge=True,
)
base_ff = individus_to_entity(base_indiv, "foyer_fiscal")
Distrib_BASE, Distrib_POTE, quantiles = distrib_to_quantiles(
base_ff, variable, get_quantiles_casd(variable)
)
fig, error_df, final_error = pote_comparison(
base_ff, variable, title_suffix="", log=True
)
# Ajout de gens
Distrib_BASE3, Distrib_POTE3 = add_missing_indiv(
leximpact_scenario, annee_ajout, variable
)
fig, error_df, final_error = pote_comparison(
Distrib_BASE.df,
variable,
title_suffix="",
log=True,
base_ff_cal=Distrib_BASE3.df,
)
# Validation
# assert pas de bucket vide
tc.assertEqual((Distrib_BASE3.df["nb_ff"] == 0).unique() is False, True)
# assert sum de var X dans le bucket ~ celle du bucket de POTE
tc.assertEqual(Distrib_BASE.df["sum"], Distrib_POTE3.df["sum"])
# assert moyenne des autres var n'a pas changé
print("en cours")
test_add_missing_indiv_rfr()Données du scénario :
{'input_data_table_by_entity_by_period': {2018: {'individu': 'individu_2018', 'menage': 'menage_2018'}, 2019: {'individu': 'individu_2018', 'menage': 'menage_2018'}, 2020: {'individu': 'individu_2018', 'menage': 'menage_2018'}, 2021: {'individu': 'individu_2018', 'menage': 'menage_2018'}, 2022: {'individu': 'individu_2018', 'menage': 'menage_2018'}}, 'survey': 'input'}
On a une base initiale de 112144 individus pour 56935 foyer_fiscal s
On regroupe la base selon ['weight_individus', 'weight_familles', 'weight_foyers', 'foyer_fiscal_id']
On a 56935 foyer_fiscal apres le groupby
Nombre de foyers de rfr à zéro, dans l'ERFS : 3572896.0 et dans POTE : 2789455 soit un écart de : 28.08580887664436 %
[0, 0.5, 326.0, 1145.0, 2058.0, 3000.0, 3801.0, 4591.0, 5307.0, 5691.0, 6294.0, 6865.0, 7383.0, 7879.0, 8364.0, 8829.0, 9278.0, 9703.0, 10103.0, 10496.0, 10881.0, 11230.0, 11589.0, 11919.0, 12263.0, 12609.0, 12959.0, 13388.0, 13774.0, 14140.0, 14422.0, 14700.0, 14967.0, 15234.0, 15499.0, 15781.0, 16062.0, 16347.0, 16634.0, 16920.0, 17209.0, 17504.0, 17804.0, 18103.0, 18409.0, 18726.0, 19048.0, 19374.0, 19701.0, 20041.0, 20393.0, 20757.0, 21132.0, 21514.0, 21909.0, 22323.0, 22794.0, 23280.0, 23874.0, 24580.0, 25122.0, 25628.0, 26155.0, 26696.0, 27250.0, 27819.0, 28406.0, 29012.0, 29635.0, 30283.0, 30954.0, 31648.0, 32366.0, 33105.0, 33872.0, 34669.0, 35505.0, 36337.0, 37258.0, 38222.0, 39230.0, 40298.0, 41430.0, 42618.0, 43892.0, 45255.0, 46727.0, 48312.0, 50031.0, 51916.0, 54000.0, 56320.0, 58942.0, 61967.0, 65530.0, 69841.0, 75272.0, 82475.0, 92857.0, 110143.0, 148696.0, 461119.5, 1917075.5, 8131520.0, 29636994.0, 1000000001]
On étudie la distribution en 105 buckets, avec un min de 0 et un max de 1000000001 € de rfr
Dans POTE, on a 2789455 foyers de rfr == 0
Dans l'ERFS, on a 3572896.0 foyers de rfr == 0
Index(['weight_individus', 'weight_familles', 'weight_foyers',
'foyer_fiscal_id', 'quifam', 'retraite_brute',
'cotisation_sociale_mode_recouvrement', 'ric',
'traitement_indiciaire_brut', 'quifoy', 'activite', 'quimen',
'heures_remunerees_volume', 'rag', 'retraite_imposable',
'primes_fonction_publique', 'rnc', 'salaire_de_base',
'autonomie_financiere', 'pensions_alimentaires_percues', 'idfam',
'idfoy', 'idmen', 'chomage_imposable', 'chomage_brut',
'effectif_entreprise', 'famille_id', 'menage_id', 'menage_role',
'menage_position', 'rfr', 'f4ba', 'statut_occupation_logement',
'weight_menages', 'wprm'],
dtype='object')
Nombre de foyers de rfr à zéro, dans l'ERFS : 3572896.0 et dans POTE : 2789455 soit un écart de : 28.08580887664436 %
[0, 0.5, 326.0, 1145.0, 2058.0, 3000.0, 3801.0, 4591.0, 5307.0, 5691.0, 6294.0, 6865.0, 7383.0, 7879.0, 8364.0, 8829.0, 9278.0, 9703.0, 10103.0, 10496.0, 10881.0, 11230.0, 11589.0, 11919.0, 12263.0, 12609.0, 12959.0, 13388.0, 13774.0, 14140.0, 14422.0, 14700.0, 14967.0, 15234.0, 15499.0, 15781.0, 16062.0, 16347.0, 16634.0, 16920.0, 17209.0, 17504.0, 17804.0, 18103.0, 18409.0, 18726.0, 19048.0, 19374.0, 19701.0, 20041.0, 20393.0, 20757.0, 21132.0, 21514.0, 21909.0, 22323.0, 22794.0, 23280.0, 23874.0, 24580.0, 25122.0, 25628.0, 26155.0, 26696.0, 27250.0, 27819.0, 28406.0, 29012.0, 29635.0, 30283.0, 30954.0, 31648.0, 32366.0, 33105.0, 33872.0, 34669.0, 35505.0, 36337.0, 37258.0, 38222.0, 39230.0, 40298.0, 41430.0, 42618.0, 43892.0, 45255.0, 46727.0, 48312.0, 50031.0, 51916.0, 54000.0, 56320.0, 58942.0, 61967.0, 65530.0, 69841.0, 75272.0, 82475.0, 92857.0, 110143.0, 148696.0, 461119.5, 1917075.5, 8131520.0, 29636994.0, 1000000001]
On étudie la distribution en 105 buckets, avec un min de 0 et un max de 1000000001 € de rfr
Dans POTE, on a 2789455 foyers de rfr == 0
Dans l'ERFS, on a 3572896.0 foyers de rfr == 0
Somme des poids avant calibration 32640040.0
Erreur moyenne des buckets de rfr : 66.47112853432229 %
Erreur min : 7.452301043677421 erreur max: 725.0742587983459
On a une base initiale de 112144 individus pour 56935 foyer_fiscal s
On regroupe la base selon ['weight_individus', 'weight_familles', 'weight_foyers', 'foyer_fiscal_id']
On a 56935 foyer_fiscal apres le groupby
Nombre de foyers de rfr à zéro, dans l'ERFS : 3572896.0 et dans POTE : 2789455 soit un écart de : 28.08580887664436 %
[0, 0.5, 326.0, 1145.0, 2058.0, 3000.0, 3801.0, 4591.0, 5307.0, 5691.0, 6294.0, 6865.0, 7383.0, 7879.0, 8364.0, 8829.0, 9278.0, 9703.0, 10103.0, 10496.0, 10881.0, 11230.0, 11589.0, 11919.0, 12263.0, 12609.0, 12959.0, 13388.0, 13774.0, 14140.0, 14422.0, 14700.0, 14967.0, 15234.0, 15499.0, 15781.0, 16062.0, 16347.0, 16634.0, 16920.0, 17209.0, 17504.0, 17804.0, 18103.0, 18409.0, 18726.0, 19048.0, 19374.0, 19701.0, 20041.0, 20393.0, 20757.0, 21132.0, 21514.0, 21909.0, 22323.0, 22794.0, 23280.0, 23874.0, 24580.0, 25122.0, 25628.0, 26155.0, 26696.0, 27250.0, 27819.0, 28406.0, 29012.0, 29635.0, 30283.0, 30954.0, 31648.0, 32366.0, 33105.0, 33872.0, 34669.0, 35505.0, 36337.0, 37258.0, 38222.0, 39230.0, 40298.0, 41430.0, 42618.0, 43892.0, 45255.0, 46727.0, 48312.0, 50031.0, 51916.0, 54000.0, 56320.0, 58942.0, 61967.0, 65530.0, 69841.0, 75272.0, 82475.0, 92857.0, 110143.0, 148696.0, 461119.5, 1917075.5, 8131520.0, 29636994.0, 1000000001]
On étudie la distribution en 105 buckets, avec un min de 0 et un max de 1000000001 € de rfr
Dans POTE, on a 2789455 foyers de rfr == 0
Dans l'ERFS, on a 3572896.0 foyers de rfr == 0
List missing [103, 104]
On a 2 buckets vides
On a une base initiale de 112144 individus pour 56935 foyer_fiscal s
On regroupe la base selon ['weight_individus', 'weight_familles', 'weight_foyers', 'foyer_fiscal_id']
On a 56935 foyer_fiscal apres le groupby
On ajoute 333 personnes dans le bucket 103 entre 8131520.0 et 29636994.0
new_ff 333
On a une base initiale de 112144 individus pour 56935 foyer_fiscal s
On regroupe la base selon ['weight_individus', 'weight_familles', 'weight_foyers', 'foyer_fiscal_id']
On a 56935 foyer_fiscal apres le groupby
On ajoute 37 personnes dans le bucket 104 entre 29636994.0 et 1000000001
new_ff 37
On a une base initiale de 112294 individus pour 56935 foyer_fiscal s
On regroupe la base selon ['weight_individus', 'weight_familles', 'weight_foyers', 'foyer_fiscal_id']
On a 56935 foyer_fiscal apres le groupby
On a une base initiale de 112144 individus pour 56935 foyer_fiscal s
On regroupe la base selon ['weight_individus', 'weight_familles', 'weight_foyers', 'foyer_fiscal_id']
On a 56935 foyer_fiscal apres le groupby
Index(['weight_individus', 'weight_familles', 'weight_foyers',
'foyer_fiscal_id', 'quifam', 'retraite_brute',
'cotisation_sociale_mode_recouvrement', 'ric',
'traitement_indiciaire_brut', 'quifoy', 'activite', 'quimen',
'heures_remunerees_volume', 'rag', 'retraite_imposable',
'primes_fonction_publique', 'rnc', 'salaire_de_base',
'autonomie_financiere', 'pensions_alimentaires_percues', 'idfam',
'idfoy', 'idmen', 'chomage_imposable', 'chomage_brut',
'effectif_entreprise', 'famille_id', 'menage_id', 'menage_role',
'menage_position', 'rfr', 'f4ba', 'statut_occupation_logement',
'weight_menages', 'wprm'],
dtype='object')
Nombre de foyers de rfr à zéro, dans l'ERFS : 3572896.0 et dans POTE : 2789455 soit un écart de : 28.08580887664436 %
[0, 0.5, 326.0, 1145.0, 2058.0, 3000.0, 3801.0, 4591.0, 5307.0, 5691.0, 6294.0, 6865.0, 7383.0, 7879.0, 8364.0, 8829.0, 9278.0, 9703.0, 10103.0, 10496.0, 10881.0, 11230.0, 11589.0, 11919.0, 12263.0, 12609.0, 12959.0, 13388.0, 13774.0, 14140.0, 14422.0, 14700.0, 14967.0, 15234.0, 15499.0, 15781.0, 16062.0, 16347.0, 16634.0, 16920.0, 17209.0, 17504.0, 17804.0, 18103.0, 18409.0, 18726.0, 19048.0, 19374.0, 19701.0, 20041.0, 20393.0, 20757.0, 21132.0, 21514.0, 21909.0, 22323.0, 22794.0, 23280.0, 23874.0, 24580.0, 25122.0, 25628.0, 26155.0, 26696.0, 27250.0, 27819.0, 28406.0, 29012.0, 29635.0, 30283.0, 30954.0, 31648.0, 32366.0, 33105.0, 33872.0, 34669.0, 35505.0, 36337.0, 37258.0, 38222.0, 39230.0, 40298.0, 41430.0, 42618.0, 43892.0, 45255.0, 46727.0, 48312.0, 50031.0, 51916.0, 54000.0, 56320.0, 58942.0, 61967.0, 65530.0, 69841.0, 75272.0, 82475.0, 92857.0, 110143.0, 148696.0, 461119.5, 1917075.5, 8131520.0, 29636994.0, 1000000001]
On étudie la distribution en 105 buckets, avec un min de 0 et un max de 1000000001 € de rfr
Dans POTE, on a 2789455 foyers de rfr == 0
Dans l'ERFS, on a 3572896.0 foyers de rfr == 0
Somme des poids avant calibration 32640040.0
Somme des poids après calibration 32640040.0
Nombre de foyers de rfr à zéro, dans l'ERFS : 3572896.0 et dans POTE : 2789455 soit un écart de : 28.08580887664436 %
[0, 0.5, 326.0, 1145.0, 2058.0, 3000.0, 3801.0, 4591.0, 5307.0, 5691.0, 6294.0, 6865.0, 7383.0, 7879.0, 8364.0, 8829.0, 9278.0, 9703.0, 10103.0, 10496.0, 10881.0, 11230.0, 11589.0, 11919.0, 12263.0, 12609.0, 12959.0, 13388.0, 13774.0, 14140.0, 14422.0, 14700.0, 14967.0, 15234.0, 15499.0, 15781.0, 16062.0, 16347.0, 16634.0, 16920.0, 17209.0, 17504.0, 17804.0, 18103.0, 18409.0, 18726.0, 19048.0, 19374.0, 19701.0, 20041.0, 20393.0, 20757.0, 21132.0, 21514.0, 21909.0, 22323.0, 22794.0, 23280.0, 23874.0, 24580.0, 25122.0, 25628.0, 26155.0, 26696.0, 27250.0, 27819.0, 28406.0, 29012.0, 29635.0, 30283.0, 30954.0, 31648.0, 32366.0, 33105.0, 33872.0, 34669.0, 35505.0, 36337.0, 37258.0, 38222.0, 39230.0, 40298.0, 41430.0, 42618.0, 43892.0, 45255.0, 46727.0, 48312.0, 50031.0, 51916.0, 54000.0, 56320.0, 58942.0, 61967.0, 65530.0, 69841.0, 75272.0, 82475.0, 92857.0, 110143.0, 148696.0, 461119.5, 1917075.5, 8131520.0, 29636994.0, 1000000001]
On étudie la distribution en 105 buckets, avec un min de 0 et un max de 1000000001 € de rfr
Dans POTE, on a 2789455 foyers de rfr == 0
Dans l'ERFS, on a 3572896.0 foyers de rfr == 0
Erreur moyenne des buckets de rfr : 67.1858188527913 %
Erreur min : 7.452301043677421 erreur max: 725.0742587983459
Total de rfr avant : 1901588400000.0 et après calibration : 1915776000000.0
Avant 112144 apres 112294
ATTENTION TRAVAIL EN COURS
Il faudra encore inserer cette base dans un scenario
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/leximpact-prepare-data/leximpact_prepare_data/helpers_survey_scenario.py:158: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
sample_entity = sample_individus.groupby(wprm_list, as_index=False).sum()
/home/jupyter-sasha/leximpact-prepare-data/leximpact_prepare_data/helpers_survey_scenario.py:158: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
sample_entity = sample_individus.groupby(wprm_list, as_index=False).sum()
/home/jupyter-sasha/leximpact-prepare-data/leximpact_prepare_data/helpers_survey_scenario.py:158: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
sample_entity = sample_individus.groupby(wprm_list, as_index=False).sum()
/home/jupyter-sasha/leximpact-prepare-data/leximpact_prepare_data/helpers_survey_scenario.py:158: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
sample_entity = sample_individus.groupby(wprm_list, as_index=False).sum()
/tmp/ipykernel_37717/2208608261.py:13: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
new_people[new_people["foyer_fiscal_id"] == id_foy]["wprm"] = new_ff[
/tmp/ipykernel_37717/2208608261.py:29: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
new_people[new_people["foyer_fiscal_id"] == id_foy]["famille_id"] = (
/tmp/ipykernel_37717/2208608261.py:32: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
new_people[new_people["foyer_fiscal_id"] == id_foy]["menage_id"] = max_idfoy + i
/tmp/ipykernel_37717/2208608261.py:33: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
new_people[new_people["foyer_fiscal_id"] == id_foy]["new_foyer_fiscal_id"] = (
/home/jupyter-sasha/leximpact-prepare-data/leximpact_prepare_data/helpers_survey_scenario.py:158: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
sample_entity = sample_individus.groupby(wprm_list, as_index=False).sum()
/home/jupyter-sasha/leximpact-prepare-data/leximpact_prepare_data/helpers_survey_scenario.py:158: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
sample_entity = sample_individus.groupby(wprm_list, as_index=False).sum()
AttributeError: 'DataFrame' object has no attribute 'df'


Exemple Minimum
Initialisation
leximpact_scenario = LeximpactErfsSurveyScenario(final_year=2022)
for year in range(int(annee_erfs), int(annee_de_calcul) + 1):
rfr = leximpact_scenario.calculate_variable("rfr", period=year)
# leximpact_scenario.simulation.set_input("rfr", year, rfr)Données du scénario :
{'input_data_table_by_entity_by_period': {2018: {'individu': 'individu_2018', 'menage': 'menage_2018'}, 2019: {'individu': 'individu_2018', 'menage': 'menage_2018'}, 2020: {'individu': 'individu_2018', 'menage': 'menage_2018'}, 2021: {'individu': 'individu_2018', 'menage': 'menage_2018'}, 2022: {'individu': 'individu_2018', 'menage': 'menage_2018'}}, 'survey': 'input'}
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
/home/jupyter-sasha/.cache/pypoetry/virtualenvs/leximpact-prepare-data-77FW3yLw-py3.8/lib/python3.8/site-packages/openfisca_core/holders/holder.py:167: Warning: You cannot set a value for the variable loyer, as it has been neutralized. The value you provided ([0. 0. 0. ... 0. 0. 0.]) will be ignored.
return warnings.warn(
var_list = leximpact_scenario._leximpact_variables
base_ind = leximpact_scenario.create_data_frame_by_entity(
variables=var_list, period=2019, merge=True
)base_ind.keys()Index(['quifam', 'retraite_brute', 'cotisation_sociale_mode_recouvrement',
'ric', 'traitement_indiciaire_brut', 'quifoy', 'activite', 'quimen',
'heures_remunerees_volume', 'rag', 'retraite_imposable',
'primes_fonction_publique', 'rnc', 'categorie_salarie',
'salaire_de_base', 'autonomie_financiere', 'statut_marital', 'age',
'pensions_alimentaires_percues', 'contrat_de_travail', 'idfam', 'idfoy',
'idmen', 'chomage_imposable', 'chomage_brut', 'date_naissance',
'weight_individus', 'effectif_entreprise', 'famille_id', 'famille_role',
'famille_position', 'foyer_fiscal_id', 'foyer_fiscal_role',
'foyer_fiscal_position', 'menage_id', 'menage_role', 'menage_position',
'weight_familles', 'weight_foyers', 'rfr', 'f4ba',
'statut_occupation_logement', 'weight_menages', 'wprm'],
dtype='object')
Plot des quantiles
variable = "rfr"
quantiles = get_quantiles_casd(variable)Distrib_BASE, Distrib_POTE, quantiles = distrib_to_quantiles(
individus_to_entity(base_ind, "foyer_fiscal"), variable, quantiles
)On a une base initiale de 16767 individus pour 9562 foyer_fiscal s
On regroupe la base selon ['weight_individus', 'weight_familles', 'weight_foyers', 'foyer_fiscal_id']
On a 9562 foyer_fiscal apres le groupby
Nombre de foyers de rfr à zéro, dans l'ERFS : 3572896.0 et dans POTE : 2789455 soit un écart de : 28.08580887664436 %
[0, 0.5, 326.0, 1145.0, 2058.0, 3000.0, 3801.0, 4591.0, 5307.0, 5691.0, 6294.0, 6865.0, 7383.0, 7879.0, 8364.0, 8829.0, 9278.0, 9703.0, 10103.0, 10496.0, 10881.0, 11230.0, 11589.0, 11919.0, 12263.0, 12609.0, 12959.0, 13388.0, 13774.0, 14140.0, 14422.0, 14700.0, 14967.0, 15234.0, 15499.0, 15781.0, 16062.0, 16347.0, 16634.0, 16920.0, 17209.0, 17504.0, 17804.0, 18103.0, 18409.0, 18726.0, 19048.0, 19374.0, 19701.0, 20041.0, 20393.0, 20757.0, 21132.0, 21514.0, 21909.0, 22323.0, 22794.0, 23280.0, 23874.0, 24580.0, 25122.0, 25628.0, 26155.0, 26696.0, 27250.0, 27819.0, 28406.0, 29012.0, 29635.0, 30283.0, 30954.0, 31648.0, 32366.0, 33105.0, 33872.0, 34669.0, 35505.0, 36337.0, 37258.0, 38222.0, 39230.0, 40298.0, 41430.0, 42618.0, 43892.0, 45255.0, 46727.0, 48312.0, 50031.0, 51916.0, 54000.0, 56320.0, 58942.0, 61967.0, 65530.0, 69841.0, 75272.0, 82475.0, 92857.0, 110143.0, 148696.0, 461119.5, 1917075.5, 8131520.0, 29636994.0, 1000000001]
On étudie la distribution en 105 buckets, avec un min de 0 et un max de 1000000001 € de rfr
Dans POTE, on a 2789455 foyers de rfr == 0
Dans l'ERFS, on a 3572896.0 foyers de rfr == 0
/home/jupyter-sasha/leximpact-prepare-data/leximpact_prepare_data/helpers_survey_scenario.py:158: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
sample_entity = sample_individus.groupby(wprm_list, as_index=False).sum()
Distrib_BASE.df| seuil_inf | seuil_max | middle | nb_ff | sum | mean | |
|---|---|---|---|---|---|---|
| 0 | 0 | 0.5 | 0.25 | 3572896 | 0.0 | 0.0 |
| 1 | 0.5 | 326.0 | 163.25 | 103627 | 15897209.0 | 85468.867188 |
| 2 | 326.0 | 1145.0 | 735.5 | 254752 | 189531472.0 | 428804.25 |
| 3 | 1145.0 | 2058.0 | 1601.5 | 253884 | 406238272.0 | 933881.0625 |
| 4 | 2058.0 | 3000.0 | 2529.0 | 299030 | 749223808.0 | 1454803.5 |
| ... | ... | ... | ... | ... | ... | ... |
| 100 | 148696.0 | 461119.5 | 304907.75 | 0 | 0.0 | NaN |
| 101 | 461119.5 | 1917075.5 | 1189097.5 | 0 | 0.0 | NaN |
| 102 | 1917075.5 | 8131520.0 | 5024297.75 | 0 | 0.0 | NaN |
| 103 | 8131520.0 | 29636994.0 | 18884257.0 | 0 | 0.0 | NaN |
| 104 | 29636994.0 | 1000000001 | 514818497.5 | 0 | 0.0 | NaN |
105 rows × 6 columns
fig, error_df, final_error = compare_distributions(
Distrib_BASE.df, Distrib_POTE.df, variable, annee_erfs, annee_pote, "Avant", log
)Erreur moyenne des buckets de rfr : 94.25614025514825 %
Erreur min : 19.966570932336598 erreur max: 100.0

Création d’une base de test (où il manque des gens)
base_ind.sort_values(by=variable, inplace=True)
base_ind[variable]
base_ind = base_ind[base_ind[variable] < 5000]
# leximpact_scenario.delete_arrays()88073 0.0
66781 0.0
66782 0.0
66783 0.0
66784 0.0
...
83863 4995.0
59084 4995.0
83860 4995.0
83862 4995.0
83861 4995.0
Name: rfr, Length: 16767, dtype: float32
Distrib_BASE2, Distrib_POTE2, quantiles = distrib_to_quantiles(
individus_to_entity(base_ind, "foyer_fiscal"), variable, quantiles
)On a une base initiale de 16767 individus pour 9562 foyer_fiscal s
On regroupe la base selon ['weight_individus', 'weight_familles', 'weight_foyers', 'foyer_fiscal_id']
On a 9562 foyer_fiscal apres le groupby
Nombre de foyers de rfr à zéro, dans l'ERFS : 3572896.0 et dans POTE : 2789455 soit un écart de : 28.08580887664436 %
[0, 0.5, 326.0, 1145.0, 2058.0, 3000.0, 3801.0, 4591.0, 5307.0, 5691.0, 6294.0, 6865.0, 7383.0, 7879.0, 8364.0, 8829.0, 9278.0, 9703.0, 10103.0, 10496.0, 10881.0, 11230.0, 11589.0, 11919.0, 12263.0, 12609.0, 12959.0, 13388.0, 13774.0, 14140.0, 14422.0, 14700.0, 14967.0, 15234.0, 15499.0, 15781.0, 16062.0, 16347.0, 16634.0, 16920.0, 17209.0, 17504.0, 17804.0, 18103.0, 18409.0, 18726.0, 19048.0, 19374.0, 19701.0, 20041.0, 20393.0, 20757.0, 21132.0, 21514.0, 21909.0, 22323.0, 22794.0, 23280.0, 23874.0, 24580.0, 25122.0, 25628.0, 26155.0, 26696.0, 27250.0, 27819.0, 28406.0, 29012.0, 29635.0, 30283.0, 30954.0, 31648.0, 32366.0, 33105.0, 33872.0, 34669.0, 35505.0, 36337.0, 37258.0, 38222.0, 39230.0, 40298.0, 41430.0, 42618.0, 43892.0, 45255.0, 46727.0, 48312.0, 50031.0, 51916.0, 54000.0, 56320.0, 58942.0, 61967.0, 65530.0, 69841.0, 75272.0, 82475.0, 92857.0, 110143.0, 148696.0, 461119.5, 1917075.5, 8131520.0, 29636994.0, 1000000001]
On étudie la distribution en 105 buckets, avec un min de 0 et un max de 1000000001 € de rfr
Dans POTE, on a 2789455 foyers de rfr == 0
Dans l'ERFS, on a 3572896.0 foyers de rfr == 0
/home/jupyter-sasha/leximpact-prepare-data/leximpact_prepare_data/helpers_survey_scenario.py:158: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
sample_entity = sample_individus.groupby(wprm_list, as_index=False).sum()
fig, error_df, final_error = compare_distributions(
Distrib_BASE2.df, Distrib_POTE.df, variable, annee_erfs, annee_pote, "Avant", log
)Erreur moyenne des buckets de rfr : 94.25614025514825 %
Erreur min : 19.966570932336598 erreur max: 100.0

Ajout de gens dans la base de test
Distrib_BASE3, Distrib_POTE3 = add_missing_indiv(
leximpact_scenario, annee_pote, variable
)fig, error_df, final_error = pote_comparison(
Distrib_BASE.df, variable, title_suffix="", log=True, base_ff_cal=Distrib_BASE3.df
)Essais et tâtonnements
TESTS
def test_distrib_to_quantiles():
variable = "retraite_brute"
scenario = LeximpactErfsSurveyScenario(final_year=2022)
quantiles = get_quantiles_casd(variable)
# On va chercher notre base
ids = [
"menage_ordinaire",
"menage_ordinaire_familles",
"menage_ordinaire_foyers_fiscaux",
"menage_ordinaire_individus",
"weight_familles",
"weight_foyers",
"weight_individus",
"weight_menages",
"wprm",
"wprm_init",
]
var_list = scenario.used_as_input_variables + ids
base_indiv = scenario.create_data_frame_by_entity(
variables=var_list, period=2019, merge=True
)
Distrib_BASE, Distrib_POTE, quantiles = distrib_to_quantiles(
base_indiv, variable, quantiles
)
assert Distrib_BASE.df.equals(Distrib_POTE.df) == 0
print("en cours")
test_distrib_to_quantiles()def test_add_missing_indiv_retraite():
variable = "retraite_imposable"
leximpact_scenario = LeximpactErfsSurveyScenario(final_year=2022)
add_missing_indiv(leximpact_scenario, 2019, variable)
test_add_missing_indiv_retraite()