---
title: Répartition de différentes variables (RFR, RK) dans la population pour 2019
keywords: fastai
sidebar: home_sidebar
nb_path: "notebooks/extractions_base_des_impots/plot_rfr_2019.ipynb"
---
import dask.dataframe as dd
import pandas as pd
import seaborn as sns
import dask.dataframe as dd
# Glob pattern matching every HDF file of the CSG tax-base extraction.
input_directory = r"C:\Users\Public\Documents\TRAVAIL\csg\data_in\extraction_assiettes_csg\*.hdf"
# Open the "/pote2019" key of all matching files as a single dask DataFrame.
rfrs = dd.read_hdf(input_directory, key="/pote2019")
rfrs.columns
# Keep only the rows whose "revkire" (the RFR, per the message printed below)
# is strictly positive, then materialise the filtered result with .compute().
has_positive_rfr = rfrs["revkire"] > 0
df = rfrs[has_positive_rfr].compute()
print(f"Somme du RFR de tous les foyers de POTE {df.revkire.sum():,} €")
# Number of quantile buckets; 100 buckets means the rows below are centiles.
nb_quantiles = 100
# Cumulative probabilities 1/nb_quantiles, 2/nb_quantiles, ..., 1 at which
# the quantiles of every column of `df` are evaluated.
centile = [(1 / nb_quantiles) * (i + 1) for i in range(nb_quantiles)]
centile[49]
# One row per probability in `centile`, one column per column of `df`;
# the index holds the probabilities themselves.
df_quantiles = pd.DataFrame(df.quantile(centile))
# Integer percentage label for each row (1, 2, ..., 100 when nb_quantiles is
# 100). The product must be rounded, not truncated: floating-point error puts
# some products just below the intended integer (0.29 * 100 evaluates to
# 28.999999999999996), so a plain int cast would label the 29th centile "28",
# duplicating the real 28th and leaving no row labelled 29.
df_quantiles["quantiles"] = [round(q * 100) for q in df_quantiles.index]
# parquet_path = r'C:\Users\Public\Documents\TRAVAIL\csg\data_in\assiettes_csg.parquet'
# rfrs = dd.read_parquet(parquet_path)
# rfrs.columns
# Default figure size (width, height) applied through seaborn's rc settings.
figure_size = (20, 8)
sns.set(rc={"figure.figsize": figure_size})
# Bar chart of df_quantiles: the "quantiles" column on the x axis and the
# "revkire" column as bar height.
ax = sns.barplot(x="quantiles", y="revkire", data=df_quantiles)
_ = ax.set_title("Centiles de RFR dans POTE 2019\nEchelle logarithmique")
# Logarithmic y axis, as announced in the title.
_ = ax.set_yscale("log")
# Re-apply the existing x tick labels, rotated by 90 degrees.
tick_labels = ax.get_xticklabels()
_ = ax.set_xticklabels(labels=tick_labels, rotation=90)