--- title: Generate a fake POTE input file keywords: fastai sidebar: home_sidebar nb_path: "notebooks/extractions_base_des_impots/00_generate_fake_data.ipynb" ---
# Output directory for the generated fake-POTE parquet files.
OUT_PATH = "/media/data-nvme/dev/src/LEXIMPACT/"
from IPython.core.interactiveshell import InteractiveShell
# Notebook setting: display every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import vaex
import seaborn as sns
# IPython shell magic (not plain Python): print the interpreter version.
!python --version
Voir la doc de scipy : https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.genpareto.html
from scipy.stats import genpareto
# Shape parameter of the generalized Pareto distribution.
c = 0.1
# First four standardized moments (mean, variance, skewness, kurtosis) for this shape.
mean, var, skew, kurt = genpareto.stats(c, moments="mvsk")
print(f"{mean=} {var=} {skew=} {kurt=}")
# 100 evenly spaced points between the 1st and 99th percentiles of the distribution.
x = np.linspace(genpareto.ppf(0.01, c), genpareto.ppf(0.99, c), 100)
print(f"{x.shape=}")
fig, ax = plt.subplots(1, 1)
# Theoretical density curve ...
_ = ax.plot(x, genpareto.pdf(x, c), "r-", lw=5, alpha=0.6, label="genpareto pdf")
# ... compared with a histogram of 100 000 draws (no seed: varies between runs).
r = genpareto.rvs(c, size=100_000)
_ = ax.hist(r, density=True, histtype="stepfilled", alpha=0.2)
_ = ax.legend(loc="best", frameon=False)
plt.show()
# Bare expression, displayed by the notebook. NOTE(review): the slice [10:2:-10]
# (start 10, stop 2, step -10) yields a single element — possibly not what was intended.
sorted(r * 10e6)[10:2:-10]
Par exemple, ici on demande 100 valeurs tirées d'une loi de Pareto généralisée (c = 0,5, échelle = 1 000, graine fixée pour la reproductibilité) :
c = 0.5
# 100 draws, scale 1 000; random_state=1 makes the draw reproducible.
r = genpareto.rvs(c, size=100, loc=0, scale=1_000, random_state=1)
# Histogram with 4 bins, raw counts (density=False).
count, bins, ax = plt.hist(r, 4, density=False)
print(f"{r.shape=} {r.min()=} {r.mean()=} {r.max()=}")
print(f"Frontière de l'histogramme :{bins}")
print(f"Nombre de personnes dans chaque bars :{count}")
On voit sur l'histogramme qu'on a 90% des personnes qui gagnent moins de 4 237 €
Et seulement 1% qui gagnent plus de 12 700 €, avec un maximum de 16 949 €
def get_random_data(
    nb_zeros: int,
    nb_above_zero: int,
    c: float = 0.9,
    scale: float = 80_000,
    random_state: int = 1,
):
    """Generate a fake income column: zeros followed by generalized-Pareto draws.

    Parameters
    ----------
    nb_zeros : int
        Number of individuals whose value is 0.
    nb_above_zero : int
        Number of individuals for whom a positive value is drawn.
    c : float
        Shape parameter of the generalized Pareto distribution.
    scale : float
        Scale parameter of the distribution.
    random_state : int
        Seed for the draw. Previously hard-coded to 1, which made every call
        return the exact same sequence; it is now a parameter (default 1
        keeps backward-compatible behavior).

    Returns
    -------
    numpy.ndarray
        Array of length ``nb_zeros + nb_above_zero``: the zeros first, then
        the Pareto draws.
    """
    zeros = np.zeros((nb_zeros,))
    # scipy.stats.genpareto (not numpy.random.pareto); seeded for reproducibility.
    pareto = genpareto.rvs(
        c, size=nb_above_zero, loc=0, scale=scale, random_state=random_state
    )
    return np.concatenate((zeros, pareto), axis=0)
c = 0.6
# 38M individuals total: 2M at zero plus 36M generalized-Pareto draws.
r = get_random_data(
    nb_zeros=2_000_000, nb_above_zero=36_000_000, c=c, scale=4000 * (1 / c)
)
print(f"{r.shape=} {r.min()=} {r.mean()=} {r.max()=}")
# "revkire" — per the chart title below, this models the RFR (revenu fiscal de référence).
df = pd.DataFrame({"revkire": r})
nb_quantiles = 100
# Quantile levels 0.01, 0.02, ..., 1.00.
centile = [(1 / nb_quantiles) * (i + 1) for i in range(nb_quantiles)]
# Bare expression, displayed by the notebook: the median level (0.5).
centile[49]
quantiles = df.quantile(centile)
df_quantiles = pd.DataFrame(quantiles)
# Convert the quantile-level index (0.01..1.0) into integer percent labels.
df_quantiles["quantiles"] = df_quantiles.index * 100
df_quantiles["quantiles"] = df_quantiles["quantiles"].astype(int)
sns.set(rc={"figure.figsize": (20, 8)})
ax = sns.barplot(data=df_quantiles, x="quantiles", y="revkire")
# Log scale: the top centiles dwarf the rest of the distribution.
_ = ax.set_yscale("log")
_ = ax.set_xticklabels(labels=ax.get_xticklabels(), rotation=90)
_ = ax.set_title("Centiles de RFR dans FAKE POTE 2019\nEchelle logarithmique")
On est très proche de ce qu'on l'on a vu sur POTE. 👏
# Specification of every column to generate in the fake POTE file.
# For each column:
#   name     - column name in the output dataframe
#   c        - generalized Pareto shape parameter (passed to get_random_data)
#   nb_zeros - number of individuals whose value is 0
#   scale    - generalized Pareto scale parameter
col_to_fake = [
    {
        "name": "revkire",
        "c": 0.6,
        "nb_zeros": 2_000_000,
        "scale": 4000 * (1 / 0.6),
    },
    {
        "name": "rev_capital_partiel",
        "c": 0.5,
        "nb_zeros": 10_000_000,
        "scale": 30,
    },
    {
        "name": "rev_salaire",
        "c": 0.6,
        "nb_zeros": 2_000_000,
        "scale": 1000,
    },
    {
        "name": "rente_viagere",
        "c": 0.5,
        "nb_zeros": 10_000_000,
        "scale": 30,
    },
    {
        "name": "rev_categ_foncier",
        "c": 0.8,
        "nb_zeros": 20_000_000,
        "scale": 30,
    },
    {
        "name": "retraites",
        "c": 0.8,
        "nb_zeros": 20_000_000,
        "scale": 20,
    },
    {
        "name": "chomage",
        "c": 0.1,
        "nb_zeros": 30_000_000,
        "scale": 50_000,
    },
]
def gen_all_data(reduce_output_ratio=100, filepath=None):
    """Assemble a fake POTE dataframe with every column listed in ``col_to_fake``.

    ``reduce_output_ratio``: divisor applied to all row counts — 1 produces
    the full 39M-household dataset, 100 (the default) a 1% preview.
    ``filepath``: when given, the dataframe is also exported to this parquet
    file.
    Returns the assembled vaex dataframe.
    """
    total_households = 39_000_000  # number of fiscal households to model
    result = None
    for spec in col_to_fake:
        column = get_random_data(
            nb_zeros=spec["nb_zeros"] // reduce_output_ratio,
            nb_above_zero=(total_households - spec["nb_zeros"])
            // reduce_output_ratio,
            c=spec["c"],
            scale=spec["scale"],
        )
        one_col_df = vaex.from_dict({spec["name"]: column.astype(int)})
        # The first column starts the dataframe; subsequent ones are joined on.
        result = one_col_df if result is None else result.join(one_col_df)
    if filepath:
        result.export_parquet(filepath)
    return result
# Quick preview: 1/100th of the rows (default reduce_output_ratio=100).
df_pote = gen_all_data()
# Bare expression, displayed by the notebook.
df_pote
# df_pote.describe()
# Full-size dataset (reduce_output_ratio=1), exported to parquet.
df_pote_full = gen_all_data(
    reduce_output_ratio=1, filepath=f"{OUT_PATH}/fake_pote_full.parquet"
)
df_pote_full.describe()
# 300 000-row random sample, also exported for lighter-weight use.
df_pote_sample = df_pote_full.sample(n=300_000)
df_pote_sample.export_parquet(f"{OUT_PATH}/fake_pote_partial.parquet")
df_pote_sample.describe()