--- title: Application de Monte-Carlo pour ajouter des données à l' ERFS-FPR keywords: fastai sidebar: home_sidebar nb_path: "notebooks/retraitement_erfs-fpr/modules/copules_03_add_var.ipynb" ---
Dans le fichier précédent nous avons validé la pertinence de Monte-Carlo en comparant ses performances à une donnée existante.
Nous allons maintenant ajouter une donnée qui n'existe pas dans l'ERFS-FPR.
Nous allons pour cela utiliser les copules des revenus du capital de POTE pour injecter cette variable dans l'ERFS-FPR.
import math
import random
from time import time
import matplotlib.ticker as ticker
import numpy as np
import seaborn as sns
# from sklearn.metrics import r2_score
from tqdm.notebook import tqdm
from leximpact_prepare_data.calib_and_copules import (
calib_to_df,
copules_to_df,
get_calib,
get_copules,
get_fake_data,
pandas_to_vaex,
)
from leximpact_prepare_data.calibration_tools import distrib_to_quantiles
from leximpact_prepare_data.toolbase import (
foyers_fiscaux_to_individus,
individus_to_foyers_fiscaux,
)
# Default figure size for every seaborn plot in this notebook.
sns.set(rc={"figure.figsize": (20, 8)})
# Path to the individual-level ERFS file calibrated on RFR (output of step 03).
# NOTE(review): `config` and `pd` are assumed to be defined in an earlier
# notebook cell not shown here — confirm.
filepath = (
    config.get("DATA_OUT") + "03_erfs_rfr_cal_ind" + config.get("YEAR_ERFS") + ".h5"
)
# print(filepath)
erfs_03 = pd.read_hdf(filepath)
erfs_03.columns
# Aggregate individuals into fiscal households ("foyers fiscaux").
sample_pop_ff = individus_to_foyers_fiscaux(erfs_03)
# List the available copula export files (IPython shell magic).
!ls {config.get("COPULES")}
# Load the POTE copulas of capital income (assiette CSG) conditioned on RFR.
with open(
    config.get("COPULES") + "ExportCopule-2019-assiette_csg_revenus_capital.json"
) as fichier:
    dict_copules_RFR_Data = json.load(fichier)
df_copules = copules_to_df(dict_copules_RFR_Data)
df_copules
# Variables to inject into the ERFS sample via Monte-Carlo draws from the
# POTE copulas. Each entry maps a copula export file to the target column name.
data_to_process = [
    {
        "file": "ExportCopule-2019-assiette_csg_revenus_capital.json",
        "column_name": "pote_rev_capital",
        "col_name_export_pote": "assiette_csg_revenus_capital",
    },
    # {
    #     "file": "ExportCopule-2019-retraites.json",
    #     "column_name": "pote_retraite",
    #     "col_name_export_pote": "retraites",
    # },
]
# Monte-Carlo injection of the new variable(s) into the household sample,
# with 10 draws per household.
# NOTE(review): `integration_data_ff` is assumed to be imported/defined in an
# earlier cell — confirm.
sample_pop_ff, data_to_process = integration_data_ff(
    sample_pop_ff, data_to_process, nb_tirage=10
)
# sample_pop_ff
# data_to_process
# Quick sanity check on the distribution of the injected variable.
sample_pop_ff[
    [
        "pote_rev_capital",
    ]
].describe()
def print_var_erfs(var, df=None):
    """Print (and return) the weighted population total of ERFS variable *var*.

    Parameters
    ----------
    var : str
        Name of the column whose population total is reported.
    df : pandas.DataFrame, optional
        Data to total; must carry a ``wprm`` weight column. Defaults to the
        module-level ``sample_pop_ff`` so existing call sites keep working.

    Returns
    -------
    float
        The weighted sum ``(df[var] * df.wprm).sum()``.
    """
    data = sample_pop_ff if df is None else df
    # Weight each household by `wprm` to obtain a population-level total.
    total = (data[var] * data.wprm).sum()
    # Bug fix: the original message read "et de" instead of "est de".
    print(
        f"Le montant total corrigé pour la population de la variable {var} est de {total:,.0f} €"
    )
    return total
# Report the weighted total of the injected capital-income variable.
print_var_erfs("pote_rev_capital")
df = sample_pop_ff
# Weighted amount per household: variable times the survey weight `wprm`.
df["pote_rev_capital_wprm"] = df["pote_rev_capital"] * df.wprm
# Convert to vaex and sort by RFR for the calibration helper.
vaex_df = pandas_to_vaex(df).sort("rfr")
# Rename to `revkire` — presumably the POTE column name expected by
# `get_calib` — TODO confirm.
_ = vaex_df.rename("rfr", "revkire")
# Build 40 calibration buckets; statistical-secrecy threshold of 1 household.
tiny_buckets = get_calib(
    vaex_df, "pote_rev_capital_wprm", 40, nb_respect_secret_statistique=1
)
tiny_buckets["buckets"][1]
df_quantile = calib_to_df(tiny_buckets)
df_quantile.columns
dfsub = df_quantile
# Bar plot: total capital income per bucket as a function of the bucket mean.
# NOTE(review): `ci=None` is deprecated in recent seaborn; the modern
# spelling is `errorbar=None` — confirm the pinned seaborn version.
ax = sns.barplot(x="mean_tranche_var", y="sum_tranche_var", data=dfsub, ci=None)
_ = ax.tick_params(axis="x", rotation=90)
# Reformat x tick labels as euro amounts with thousands separators.
xticklabels = [f"{float(t.get_text()):,.0f} €" for t in ax.get_xticklabels()]
_ = ax.set_xticklabels(xticklabels)
# Display the y axis in billions of euros.
_ = ax.yaxis.set_major_formatter(
    ticker.FuncFormatter(lambda x, pos: f"{x / 1e9:,.0f}" + " mds €")
)
_ = ax.set_xlabel("Moyenne des revenus du capital", fontsize=16)
_ = ax.set_ylabel("Somme des revenus du capital", fontsize=16)
_ = ax.set_title(
    f"Barre des revenus du capital sur l'ERFS-FPR {config.get('YEAR_ERFS')}",
    fontsize=18,
)
# Same plot but counting households per bucket, skipping the near-zero
# buckets (mean <= 2 €) which would dwarf the rest.
dfsub = df_quantile.query("mean_tranche_var > 2")
# NOTE(review): `ci=None` is deprecated in recent seaborn (`errorbar=None`).
ax = sns.barplot(x="mean_tranche_var", y="nombre_ff_tranche", data=dfsub, ci=None)
_ = ax.tick_params(axis="x", rotation=90)
# Reformat x tick labels as euro amounts with thousands separators.
xticklabels = [f"{float(t.get_text()):,.0f} €" for t in ax.get_xticklabels()]
_ = ax.set_xticklabels(xticklabels)
# _ = ax.yaxis.set_major_formatter(
#     ticker.FuncFormatter(lambda x, pos: f"{x / 1e9:,.0f}" + " mds €")
# )
_ = ax.set_xlabel("Moyenne des revenus du capital", fontsize=16)
_ = ax.set_ylabel("Nombre de foyer", fontsize=16)
_ = ax.set_title(
    f"Barre des revenus du capital sur l'ERFS-FPR {config.get('YEAR_ERFS')}",
    fontsize=18,
)
config.get("COPULES")
# Load the POTE calibration (reference distribution) of capital income.
# NOTE(review): hard-coded absolute path — consider building it from
# `config` like the other file accesses in this notebook.
with open(
    "/mnt/data-in/casd_extract/pote/CalibPote-2019-assiette_csg_revenus_capital.json"
) as fichier:
    calib = json.load(fichier)
df = calib_to_df(calib)
df
# Project the ERFS weighted distribution onto the POTE calibration quantiles
# for comparison.
quantiles_distrib = distrib_to_quantiles(sample_pop_ff, "pote_rev_capital_wprm", calib)
calib_to_df(quantiles_distrib[2])
TODO : commenter la comparaison ci-dessus entre la distribution ERFS et la calibration POTE.
# Threshold used to truncate both axes for readability.
salaire_max = 300_000
dfsub = sample_pop_ff.query("rfr < @salaire_max and pote_rev_capital < @salaire_max")
# Scatter of the generated capital income against RFR.
ax = sns.scatterplot(
    data=dfsub,
    x="rfr",
    y="pote_rev_capital",
    alpha=0.2,
    label="Revenus du capital généré",
)
# Bug fix: the title previously hard-coded "250 000 €" while the filter
# above uses `salaire_max` (300 000 €); derive the figure from the variable
# so the two can no longer drift apart.
_ = ax.set_title(
    f"Evolution des revenus du capital < {salaire_max:,.0f} € par foyer en fonction du RFR sur l'ERFS-FPR {config.get('YEAR_ERFS')}",
    fontsize=18,
)
Q : comment savoir si la distribution générée est réaliste ?
R : en examinant l'histogramme des revenus du capital ci-dessous (échelle logarithmique), notamment sur la queue de distribution.
# Histogram of strictly positive capital income, log scale (zeros would
# break the log axis).
dfsub = sample_pop_ff.query("pote_rev_capital > 0")
ax = sns.histplot(
    data=dfsub[
        [
            "pote_rev_capital",
        ]
    ],
    kde=True,
    log_scale=True,
    bins=50,
)
# 30 log-spaced ticks from 1 € to 10 M€, labelled as euro amounts.
xticks = np.logspace(0, 7, num=30)
xlabels = [f"{j:,.0f} €" for j in xticks]
_ = ax.set_xticks(xticks)
_ = ax.set_xticklabels(xlabels, rotation=75)
_ = ax.set_xlabel("Revenus du capital", fontsize=16)
_ = ax.set_ylabel("Nombre de foyers", fontsize=16)
_ = ax.set_title(
    f"Histogramme des revenus du capital sur l'ERFS-FPR {config.get('YEAR_ERFS')}\n(échelle logarithmique)",
    fontsize=18,
)
# Same histogram, zoomed on the upper tail (> 100 000 €).
dfsub = sample_pop_ff.query("pote_rev_capital > 100_000")
ax = sns.histplot(
    data=dfsub[
        [
            "pote_rev_capital",
        ]
    ],
    kde=True,
    log_scale=True,
    bins=50,
)
# 20 log-spaced ticks from 100 k€ to 10 M€, labelled as euro amounts.
xticks = np.logspace(5, 7, num=20)
xlabels = [f"{j:,.0f} €" for j in xticks]
_ = ax.set_xticks(xticks)
_ = ax.set_xticklabels(xlabels, rotation=75)
_ = ax.set_xlabel("Revenus du capital", fontsize=16)
_ = ax.set_ylabel("Nombre de foyers", fontsize=16)
_ = ax.set_title(
    f"Histogramme des revenus du capital > 100 000 € sur l'ERFS-FPR {config.get('YEAR_ERFS')}\n(échelle logarithmique)",
    fontsize=18,
)
# Inspect the output directory (IPython shell magic).
!ls /mnt/data-out/leximpact
data_to_process
# Compare the ERFS weighted totals with the official POTE aggregates.
# NOTE(review): `aggregats`, `tc` and `get_ratios` are assumed to be
# defined in an earlier cell — confirm.
for data in data_to_process:
    pote_sum = aggregats.get_aggregate(
        config.get("YEAR_POTE"), data["col_name_export_pote"], "POTE", "sum"
    )
    erfs_sum = (sample_pop_ff[data["column_name"]] * sample_pop_ff["wprm"]).sum()
    print(
        f"{data['col_name_export_pote']} POTE={pote_sum:,}€ ERFS={erfs_sum:,.0f} € ecart={pote_sum - erfs_sum:,.0f}€ soit {100*erfs_sum/pote_sum:,.0f}%"
    )
    # Check that the injected variable lands within ±10 % of the POTE total.
    tc.assertGreater(100 * erfs_sum / pote_sum, 90)
    tc.assertLess(100 * erfs_sum / pote_sum, 110)
variable_type = "sum"
aggregats.get_aggregate("2018", "revenus_capitaux_prelevement_bareme", "POTE", "sum")
year = config.get("YEAR_ERFS")
get_ratios(year)
# Names of the newly injected columns.
new_columns = [data["column_name"] for data in data_to_process]
new_columns
# Split the fiscal-household sample back into individuals.
sample_ff_to_merge = sample_pop_ff[["idfoy"] + new_columns]
# Presumably the new variables are attached to the main declarant of each
# household — confirm; this name is not read again in this notebook section.
cols_declarant_principal = new_columns
sample_pop_ind = foyers_fiscaux_to_individus(
    erfs_03, sample_ff_to_merge, new_columns, new_columns
)
print(sample_pop_ind.columns)
# Convert to OpenFisca variable naming, then drop the raw injected columns.
sample_pop_ind_of = convert_to_openfisca(sample_pop_ind, year)
sample_pop_ind_of.drop(new_columns, axis=1, inplace=True)
sample_pop_ind_of.columns
sample_pop_ind_of.describe()
TODO : vérifier le contenu du fichier exporté ci-dessous avant de l'utiliser en production (suffixe « _dev »).
# Output path; the "_dev" suffix marks this as a development artefact.
outfile_path = (
    config.get("DATA_OUT") + "erfs_var_copules_" + config.get("YEAR_ERFS") + "_dev.h5"
)
# Write the OpenFisca-ready individual sample under the "input" key,
# overwriting any previous file.
sample_pop_ind_of.to_hdf(outfile_path, key="input", mode="w")