# --- Configuration (notebook 20: SAS -> parquet conversion) -----------------
year = "2020"
# year = "2018"

# The SAS extraction is named after the year, so derive the file name from
# `year` instead of hard-coding 2020.
SAS_FILE = (
    r"C:\Users\Public\Documents\TRAVAIL\agregats\sas/"
    + "extrait_dads_" + year + ".sas7bdat"
)

# One folder of parquet chunks per year, *inside* data\chunks.
# The trailing separator matters: without it the folder became
# "...\data\chunksextrait_dads_2020-chunk/", which is not the
# "...\data\chunks\extrait_dads_2020-chunk" folder read by notebooks 30/40.
OUT_PATH = r"C:\Users\Public\Documents\TRAVAIL\agregats\data\chunks/"
OUT_PATH = OUT_PATH + "extrait_dads_" + year + r"-chunk/"

# Number of SAS rows converted per parquet file.
taille_chunk = 600_000


def clean_chunk(chunk):
    """Normalise one chunk read from the SAS extraction.

    Column names are lower-cased and missing values of the numeric
    (non-categorical) columns ``eff_3112``, ``s_brut``, ``pepa`` and ``net``
    are replaced by 0. Categorical columns are left untouched.

    Args:
        chunk: pandas DataFrame holding one chunk of the extraction.
            It is modified in place.

    Returns:
        The same DataFrame object, for call chaining.

    Raises:
        KeyError: if one of the four numeric columns is missing.
    """
    chunk.columns = [c.lower() for c in chunk.columns]
    numeric_cols = "EFF_3112 S_BRUT PEPA NET".lower().split(" ")
    # Assign the filled columns back instead of `chunk[col].fillna(..., inplace=True)`:
    # the chained in-place form acts on a temporary object and is deprecated
    # (it silently does nothing once pandas Copy-on-Write is enabled).
    chunk[numeric_cols] = chunk[numeric_cols].fillna(0)
    return chunk
du fichier SAS\n", + "\n", + "On va lire le fichier par morceau de 1 million de lignes, pour ne pas saturer la mémoire. Il y a 39 millions de lignes.\n", + "\n", + "On va les enregistrer au fur et à mesure en format Apache Arrow." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nombre d'itérations : 103\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "103it [18:05, 10.54s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 17min 41s\n", + "Wall time: 18min 5s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "%%time\n", + "# Temps sur CASD : < 20 minutes.\n", + "\n", + "# Efface le dossier de sortie\n", + "shutil.rmtree(OUT_PATH, ignore_errors=True)\n", + "Path(OUT_PATH).mkdir(parents=True, exist_ok=True)\n", + "\n", + "dfi = pd.read_sas(\n", + " SAS_FILE, chunksize=taille_chunk, encoding=\"iso8859-15\", iterator=True\n", + ")\n", + "\n", + "dd_values = None\n", + "i = 0\n", + "print(f\"Nombre d'itérations : {61_689_822/taille_chunk:.0f}\")\n", + "for chunk in tqdm(dfi):\n", + " del dd_values\n", + " dd_values = None\n", + " chunk = clean_chunk(chunk)\n", + " dd_values = vaex.from_pandas(chunk, copy_index=False)\n", + " dd_values.export(f\"{OUT_PATH}{year}_{i}.parquet\")\n", + " #### DEBUG\n", + " i += 1\n", + "# if i>=2:\n", + "# break\n", + " #### DEBUG" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 489822 entries, 61200000 to 61689821\n", + "Data columns (total 10 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 a17 283843 non-null object \n", + " 1 a88 283843 non-null object \n", + " 2 contrat_travail 
283842 non-null object \n", + " 3 cris 283747 non-null object \n", + " 4 eff_3112 489822 non-null float64\n", + " 5 motifcdd 121886 non-null object \n", + " 6 net 489822 non-null float64\n", + " 7 pepa 489822 non-null float64\n", + " 8 s_brut 489822 non-null float64\n", + " 9 treffect 283845 non-null object \n", + "dtypes: float64(4), object(6)\n", + "memory usage: 37.4+ MB\n" + ] + } + ], + "source": [ + "chunk.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option(\"display.max_columns\", None)\n", + "chunk" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# chunk.query(\"Z1cj > 0 and revkire < 10000\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# chunk.query(\"impot != impotnet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# chunk[chunk['srbg'] < 0]['rbg'].min()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "#dd_values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "leximpact-prepare-data-kernel", + "language": "python", + "name": "leximpact-prepare-data-kernel" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/extractions_dads_postes/30_DADS-Quantiles.ipynb b/notebooks/extractions_dads_postes/30_DADS-Quantiles.ipynb new file mode 100644 index 
0000000000000000000000000000000000000000..25b55c60b10ee2924319c54c59f5ffb28969075b --- /dev/null +++ b/notebooks/extractions_dads_postes/30_DADS-Quantiles.ipynb @@ -0,0 +1,820 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "284168c2-c515-442a-8943-638ab2487933", + "metadata": {}, + "source": [ + "# CASD : Extraction d'agrégats\n", + "\n", + "TODO :\n", + "- Keep Secret des quantiles => Il faut la version 0.0.17\n", + "- Copule PEPA\n", + "- Count des variables catégorielles discrètes\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "3b1b7645-f0a7-4929-a667-3ffa31e1b4db", + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.core.interactiveshell import InteractiveShell\n", + "\n", + "InteractiveShell.ast_node_interactivity = \"all\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e888519d-ca9f-404a-b188-49de0bf72e31", + "metadata": {}, + "outputs": [], + "source": [ + "year = \"2020\"\n", + "# year = \"2018\"\n", + "# C:\\Users\\Public\\Documents\\TRAVAIL\\agregats\\data\\chunks\\extrait_dads_2020-chunk\n", + "OUT_PATH = r\"C:\\Users\\Public\\Documents\\TRAVAIL\\agregats\\data\\DADS/\"\n", + "ARROW_PATH = OUT_PATH + \"/../chunks/extrait_dads_\" + year + r\"-chunk/\"\n", + "taille_chunk = 2 * 2**20 # 2**20 = 1_048_576\n", + "# taille_chunk = 5000" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5e63307c-d42d-43d6-9ce3-8ea904a338eb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'0.0.15'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import leximpact_prepare_data\n", + "\n", + "leximpact_prepare_data.__version__" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d5c9f1f6-e464-48d1-a933-421ad58a270a", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import shutil\n", + "from pathlib import Path\n", + "from time import time\n", + "\n", + "import pandas as 
pd\n", + "import vaex\n", + "import gc\n", + "from tqdm import tqdm\n", + "\n", + "from leximpact_prepare_data.calib_and_copules import *" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "805afac3-cab5-4795-8c93-84cd0e99d45f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 438 ms\n", + "Wall time: 474 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "dfv = vaex.open(ARROW_PATH + \"*\")\n", + "tc.assertEqual(len(dfv), 61_689_822 )" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2e8a14fa-6b7a-4a3a-95d2-c559dbd2b20e", + "metadata": {}, + "outputs": [], + "source": [ + "# dfv.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "7d55425f-af65-4f69-9b84-270020a8122e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['a17',\n", + " 'a88',\n", + " 'contrat_travail',\n", + " 'cris',\n", + " 'eff_3112',\n", + " 'motifcdd',\n", + " 'net',\n", + " 'pepa',\n", + " 's_brut',\n", + " 'treffect']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dfv.get_column_names()" + ] + }, + { + "cell_type": "markdown", + "id": "04e671d2-ade2-48ac-95df-3344228bf13b", + "metadata": {}, + "source": [ + "## Variables catégorielles" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a16822e7-c562-4a48-87dd-849cf8ac75da", + "metadata": {}, + "outputs": [], + "source": [ + "categorical = [\n", + " 'A17',\n", + " 'A88',\n", + " 'CONTRAT_TRAVAIL',\n", + " 'CRIS',\n", + " 'MOTIFCDD',\n", + " 'TREFFECT'\n", + "]\n", + "categorical = [c.lower() for c in categorical]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2aed1cf7-8a89-4cca-b4bf-052274850116", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "import numpy as np\n", + "\n", + "# Temps d'éxécution : 11s\n", + "\n", + "for col in tqdm(categorical):\n", + " df_col = 
def compute_agg(vdf, columns=None):
    """Compute summary aggregates for numeric columns of a vaex DataFrame.

    Missing values are counted as 0. For every column the function reports
    how many rows are zero and the sum over all rows; mean, variance and
    standard deviation are computed over the *non-zero* rows only.

    Args:
        vdf: vaex DataFrame to aggregate. It is not modified.
        columns: names of the columns to aggregate. ``None`` or an empty
            list means every column of ``vdf``.

    Returns:
        pandas.DataFrame with one row per column and the fields ``name``,
        ``nb_line``, ``lenzero``, ``pct_zero``, ``sum``, ``mean``,
        ``variance`` and ``std_dev``.
    """
    # Without `inplace=True`, vaex's fillna returns a new DataFrame, so the
    # caller's frame is left as it was (the selections created below are
    # registered on that new frame as well).
    vdf = vdf.fillna(value=0, column_names=columns)
    ldf = vdf.shape[0]
    columns = columns if columns else vdf.get_column_names()
    sub_total = []
    for col in tqdm(columns):
        # Named selection restricting the statistics to non-zero rows.
        name = f"{col}_non_zero"
        vdf.select(f"{col} != 0", name=name)
        nb_no_zero = int(vdf.count("*", selection=name))
        lenzero = ldf - nb_no_zero
        has_values = nb_no_zero > 0
        sub_total.append(
            {
                "name": col,
                "nb_line": ldf,
                "lenzero": lenzero,
                # Guard against an empty frame (ldf == 0).
                "pct_zero": lenzero / ldf * 100 if ldf else 0.0,
                "sum": int(vdf.sum(col)),
                "mean": float(vdf.mean(col, selection=name)) if has_values else 0.0,
                "variance": float(vdf.var(col, selection=name)) if has_values else 0.0,
                "std_dev": float(vdf.std(col, selection=name)) if has_values else 0.0,
            }
        )
    return pd.DataFrame(sub_total)
<td>48.27518873372661</td>\n", + " <td>490547776913</td>\n", + " <td>15,373.36293277926</td>\n", + " <td>557,799,386.3548901</td>\n", + " <td>23,617.77691390317</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>PEPA</td>\n", + " <td>61689822</td>\n", + " <td>56327418</td>\n", + " <td>91.30747370287435</td>\n", + " <td>3173478173</td>\n", + " <td>591.8013960933506</td>\n", + " <td>176,410.5382527109</td>\n", + " <td>420.01254535158006</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>S_BRUT</td>\n", + " <td>61689822</td>\n", + " <td>17053834</td>\n", + " <td>27.644485665722947</td>\n", + " <td>818402937097</td>\n", + " <td>18,335.04698265482</td>\n", + " <td>729,497,116.5502262</td>\n", + " <td>27,009.204293179235</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " name nb_line lenzero pct_zero sum \\\n", + "0 EFF_3112 61689822 21478958 34.817668950317284 183166377917 \n", + "1 NET 61689822 29780878 48.27518873372661 490547776913 \n", + "2 PEPA 61689822 56327418 91.30747370287435 3173478173 \n", + "3 S_BRUT 61689822 17053834 27.644485665722947 818402937097 \n", + "\n", + " mean variance std_dev \n", + "0 4,555.146537438241 371,755,244.70450455 19,280.955492519155 \n", + "1 15,373.36293277926 557,799,386.3548901 23,617.77691390317 \n", + "2 591.8013960933506 176,410.5382527109 420.01254535158006 \n", + "3 18,335.04698265482 729,497,116.5502262 27,009.204293179235 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.set_option(\"display.float_format\", \"{:,}\".format)\n", + "# Export dans un fichier\n", + "df_agg.to_csv(OUT_PATH + \"/agregats_DADS_\" + year + \".csv\", index=False)\n", + "df_agg" + ] + }, + { + "cell_type": "markdown", + "id": "7f1e01ef-7303-4bfd-9d60-a28f861c56bd", + "metadata": {}, + "source": [ + "### Calcul des quantiles" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
def compute_quantile(vdf, columns=None, quantiles=10):
    """Export the quantiles of each column as JSON files.

    For every column and every requested number of quantiles, one file
    ``{OUT_PATH}/quantile_DADS_{quantile}_{year}_{col}.json`` is written.
    Missing values are counted as 0. A column that fails is reported on
    stdout and skipped, so one bad column does not abort the export.

    Args:
        vdf: vaex DataFrame holding the data. It is not modified.
        columns: names of the columns to process. ``None`` or an empty list
            means every column of ``vdf``.
        quantiles: number of quantiles to compute (e.g. ``10`` for deciles),
            or an iterable of such numbers (e.g. ``[10, 100]``).

    Returns:
        None. The results are only written to disk.
    """
    # The default used to be the int 10 while the body iterated over
    # `quantiles`: every column failed with a swallowed TypeError.
    if isinstance(quantiles, int):
        quantiles = [quantiles]
    # Without `inplace=True`, vaex's fillna returns a new DataFrame, so the
    # caller's frame is left as it was.
    vdf = vdf.fillna(value=0, column_names=columns)
    columns = columns if columns else vdf.get_column_names()
    for col in tqdm(columns):
        q = None
        try:
            q = Quantile(vdf[col].tolist())
            for quantile in quantiles:
                q_dict = q.get_quantile(quantile)
                # TODO: call keep_upper_bound_secret(q_dict) once the library
                # version supporting it is installed (see notebook header).
                with open(f"{OUT_PATH}/quantile_DADS_{quantile}_{year}_{col}.json", "w") as f:
                    f.write(json.dumps(q_dict))
        except Exception as e:
            # Deliberate best effort: report and move on to the next column.
            print(f"ERROR processing {col} {e.__class__.__name__} : {e}")
        finally:
            # The column was materialised as a Python list; release it before
            # loading the next one, including when the column failed.
            q = None
            gc.collect()
"calib_base_variable = \"s_brut\"" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d8aa5ab7-12d0-4c42-a291-a144f9c56de5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 15.6 s\n", + "Wall time: 18.6 s\n" + ] + } + ], + "source": [ + "%%time\n", + "# dfv[\"CONTRAT_TRAVAIL\"] = dfv[\"CONTRAT_TRAVAIL\"].fillna(0)\n", + "# dfv[\"CONTRAT_TRAVAIL\"] = dfv[\"CONTRAT_TRAVAIL\"].astype('int')\n", + "# dfv = dfv.sort(\"CONTRAT_TRAVAIL\")\n", + "\n", + "##dfv[\"S_BRUT\"] = dfv[\"S_BRUT\"].astype('int')\n", + "dfv = dfv.sort(calib_base_variable)\n", + "# tc.assertEqual(dfv[\"S_BRUT\"].count(), 39264696)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "501e3978-4328-4a2b-b8d4-0c2c1d275573", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 20.7 s\n", + "Wall time: 21 s\n" + ] + } + ], + "source": [ + "%%time\n", + "# < 1 minute\n", + "une_tranche_rfr = get_primary_buckets(dfv, 1, variable_to_split_on=calib_base_variable)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "c669f3c0-fc59-4cf9-b0f7-12f4f9aab6eb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'borders_values': [0, 1000000000000000], 'borders': [61689822]}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "une_tranche_rfr" + ] + }, + { + "cell_type": "markdown", + "id": "36e93ee0-d296-454c-b384-831aa2a3785d", + "metadata": {}, + "source": [ + "#### On sort les distributions de chaque variable continue" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e104fae0-e832-4664-9a6a-00bf75ff6a5c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|███████████████████████████████████████████| 4/4 [17:12<00:00, 258.23s/it]" + ] + }, + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "CPU times: total: 17min 2s\n", + "Wall time: 17min 12s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "%%time\n", + "# Temps sur CASD : 30 minutes pour 4 colonnes\n", + "nb_bucket_var = 100\n", + "\n", + "for variable in tqdm(continuous_variables):\n", + " try:\n", + " calib = get_copulas(dfv, calib_base_variable, variable, nb_bucket_var, une_tranche_rfr)\n", + " calib = calib[\"copules\"][0][\"buckets\"]\n", + " if type(calib) is str:\n", + " print(f\"ERROR {variable} calib is '{calib}'\")\n", + " continue\n", + " keep_upper_bound_secret(calib)\n", + " with open(\n", + " f\"{OUT_PATH}Distrib_DADS-{nb_bucket_var}-{year}-{variable}.json\", \"w\"\n", + " ) as f:\n", + " f.write(json.dumps(calib))\n", + " except Exception as e:\n", + " print(f\"ERROR processing {variable}\", e)\n", + " raise e" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "112972db-7b3a-4de2-b5ed-02ae91ecaaa8", + "metadata": {}, + "outputs": [], + "source": [ + "# from IPython.display import JSON\n", + "\n", + "# JSON(calib)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "be197441-db93-43fc-9544-782eb6748ecd", + "metadata": {}, + "outputs": [], + "source": [ + "# %%time\n", + "# # Temps sur CASD : 138s par iteration\n", + "# nb_bucket_var = 100\n", + "\n", + "# for variable in tqdm(continuous_variables):\n", + "# #calib = get_calib(dfv, variable, nb_bucket_var)\n", + "# # print(variable)\n", + "# calib = compute_copule_vaex(dfv, variable, nb_bucket_var, une_tranche_rfr)\n", + "# calib[\"copules\"][0][\"buckets\"][-1][\"seuil_var_supp\"] = \"secret\"\n", + "# with open(f\"{OUT_PATH}CalibPote-{nb_bucket_var}-{year}-{variable}.json\", \"w\") as f:\n", + "# f.write(json.dumps(calib[\"copules\"][0][\"buckets\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "7526cee8-ec5e-40f8-848f-be82dce7972d", + "metadata": { + "jupyter": { + "source_hidden": 
true + } + }, + "outputs": [], + "source": [ + "# %%time\n", + "# # Temps sur CASD : 538s par iteration !\n", + "# nb_bucket_var = 1000\n", + "\n", + "# for variable in tqdm(continuous_variables):\n", + "# #calib = get_calib(dfv, variable, nb_bucket_var)\n", + "# # print(variable)\n", + "# calib = compute_copule_vaex(dfv, variable, nb_bucket_var, une_tranche_rfr)\n", + "# calib[\"copules\"][0][\"buckets\"][-1][\"seuil_var_supp\"] = \"secret\"\n", + "# with open(f\"{OUT_PATH}CalibPote-{nb_bucket_var}-{year}-{variable}.json\", \"w\") as f:\n", + "# f.write(json.dumps(calib[\"copules\"][0][\"buckets\"]))" + ] + }, + { + "cell_type": "markdown", + "id": "98f7a2a0-d29b-44ce-97c0-61f68ca9dc67", + "metadata": {}, + "source": [ + "#### Extraction de Copules\n", + "\n", + "On ne sort des copules que pour:\n", + "- PEPA par salaire brut\n", + "- PEPA par type de contrat de travail\n", + "- PEPA par secteur d'activité" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "da73e5f7-a38b-4fc5-9166-936723b1375a", + "metadata": {}, + "outputs": [], + "source": [ + "# copules_frf = get_copules(dfv, 10, \"" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "3d9cab3d-41cc-4d67-9fcc-c110fbbfc706", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "a2555b60-b565-404f-9748-e9fe3f198d1f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/1 [00:00<?, ?it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ERROR processing pepa DistribDeVar : WARNING !!!!, moins de 12 éléments => On retourne une liste vide. 
!!!!!!!!!!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████| 1/1 [00:56<00:00, 56.44s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: total: 1min 17s\n", + "Wall time: 1min 17s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "%%time\n", + "centile_rfr = get_primary_buckets(dfv, 10, variable_to_split_on=calib_base_variable)\n", + "\n", + "nb_bucket_var = 10\n", + "\n", + "for variable in tqdm( [\n", + " 'pepa',\n", + "]):\n", + " try:\n", + " copule = get_copulas(dfv, calib_base_variable, variable, nb_bucket_var, centile_rfr)\n", + " # copule[\"copules\"][0][\"buckets\"][-1][\"upper_bound\"] = \"secret\"\n", + " keep_upper_bound_secret(copule[\"copules\"])\n", + " with open(\n", + " f\"{OUT_PATH}CopuleDADS_{calib_base_variable}-{nb_bucket_var}-{year}-{variable}.json\", \"w\"\n", + " ) as f:\n", + " f.write(json.dumps(copule))\n", + " except Exception as e:\n", + " print(f\"ERROR processing {variable}\", e)\n", + " # raise e" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c402fc11-1eee-4fec-84e5-a5a8e1153db8", + "metadata": {}, + "outputs": [], + "source": [ + "del dfv\n", + "gc.collect()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "leximpact-prepare-data-kernel", + "language": "python", + "name": "leximpact-prepare-data-kernel" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/extractions_dads_postes/40_categorical_bi-variate.ipynb b/notebooks/extractions_dads_postes/40_categorical_bi-variate.ipynb new file mode 100644 index 
0000000000000000000000000000000000000000..cf4d63eafe321b499b826354578529b89d1efdf8 --- /dev/null +++ b/notebooks/extractions_dads_postes/40_categorical_bi-variate.ipynb @@ -0,0 +1,213 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8e189d9b-84a8-42a9-bb05-583fde8c66bf", + "metadata": {}, + "source": [ + "# Distrib bi-variée de catégorie\n", + "\n", + "Comme les copules ne fonctionnent pas avec des catégories, nous avons sorti des centiles pour chaque catégorie." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "c7417c68-57fc-4b6e-b43c-4fdfd678716b", + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.core.interactiveshell import InteractiveShell\n", + "\n", + "InteractiveShell.ast_node_interactivity = \"all\"" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "71d0d8c8-26ef-46ac-a97c-defe097cdd3f", + "metadata": {}, + "outputs": [], + "source": [ + "year = \"2020\"\n", + "# year = \"2018\"\n", + "# C:\\Users\\Public\\Documents\\TRAVAIL\\agregats\\data\\chunks\\extrait_dads_2020-chunk\n", + "OUT_PATH = r\"C:\\Users\\Public\\Documents\\TRAVAIL\\agregats\\data\\DADS/\"\n", + "ARROW_PATH = OUT_PATH + \"/../chunks/extrait_dads_\" + year + r\"-chunk/\"\n", + "taille_chunk = 2 * 2**20 # 2**20 = 1_048_576\n", + "# taille_chunk = 5000" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "37a530d4-41cd-4004-aebe-284e93eb2946", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'0.0.15'" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import leximpact_prepare_data\n", + "\n", + "leximpact_prepare_data.__version__" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "bac5ed21-92fd-4826-9b65-67f65141be05", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import shutil\n", + "from pathlib import Path\n", + "from time import time\n", + "\n", + "import pandas as pd\n", + "import 
# For each category of a categorical column, export the centiles of PEPA.
secondary = "pepa"
nb_quantile = 100

for col_maj in tqdm(['CONTRAT_TRAVAIL', 'TREFFECT']):
    col = col_maj.lower()
    # The count files are written by notebook 30 as
    # f"{OUT_PATH}count_{year}_{col}.csv" with a lower-case column header,
    # so read them with the lower-case name (the upper-case name raised a
    # KeyError) and with the configured folder and year.
    df = pd.read_csv(f"{OUT_PATH}count_{year}_{col}.csv", dtype={col: str})
    for cat, count in zip(df[col].tolist(), df["count"].tolist()):
        # Skip the row of missing categories.
        if pd.isna(cat):
            continue
        # A masked (NaN) count marks a category too small to be published;
        # NOTE(review): skipping it here assumes its quantiles must not be
        # exported either -- confirm with the statistical-secrecy rules.
        if pd.isna(count):
            continue
        vdf_cat = dfv[dfv[col] == str(cat)]
        q = Quantile(vdf_cat[secondary].tolist())
        q_dict = q.get_quantile(nb_quantile)
        # TODO: call keep_upper_bound_secret(q_dict) once available.
        # The file name ends with the quantiled variable (`secondary`); it used
        # to repeat `col`, so nothing in the name said the values were PEPA.
        with open(
            f"{OUT_PATH}/quantile_DADS_{col}_{cat}_{nb_quantile}_{year}_{secondary}.json", "w"
        ) as f:
            f.write(json.dumps(q_dict))
/* recherche_agricole.sas
   Exploration of agricultural-sector data in DADS Postes 2019 (1/12th sample). */

/* Eligibility for TO=DE: */
/*   A88 = '01'                        => crops, livestock and hunting */
/*   CONTRAT_TRAVAIL = '02'            => fixed-term contract (CDD) */
/*   MOTIFCDD IN ('03', '04', '05')    => seasonal jobs, grape-harvest contract, "contrat d'usage" */
/*   CONTRAT_TRAVAIL = '93'            => subsidised contract (several kinds, incl. insertion contracts) */
/*   CONTRAT_TRAVAIL = '95'            => occasional work (seasonal, occasional) */
/* LIMITATIONS: the data cannot single out insertion contracts (CDDI or CDD CIE), */
/* nor permanent contracts (CDI) of job seekers hired by employer groups whose */
/* activities are all eligible. */

proc sql;
/* Number of occasional workers in the agricultural sector. */
SELECT count(A88) AS NB_TRAVAILLEURS_OCCASIONNELS FROM Dad12e19.Post12
WHERE A88 = '01' AND CONTRAT_TRAVAIL = '95';

/* Contract and pay details of every eligible position. */
/* SELECT * FROM Dad12e19.Post12 */
SELECT CONTRAT_TRAVAIL, MOTIFCDD, BRUT_F, BRUT_S, NET FROM Dad12e19.Post12
WHERE (
    A88 = '01'
    AND (
        (CONTRAT_TRAVAIL = '02' AND MOTIFCDD IN ('03', '04', '05'))
        OR CONTRAT_TRAVAIL = '93'
        OR CONTRAT_TRAVAIL = '95'
    )
);  /* the outer parenthesis was never closed in the original query */
quit;


/* recherche_pepa_macron.sas
   Exploration of DADS Postes 2019 (1/12th sample) data about the PEPA,
   also known as the "Prime Macron". */

proc sql;
/* Number of positions that received a PEPA. */
SELECT count(PEPA) AS NB_PEPA FROM Dad12e19.Post12 WHERE PEPA > 0;
/* Total amount of PEPA paid. */
SELECT sum(PEPA) AS SOMME_PEPA FROM Dad12e19.Post12;
quit;