diff --git a/notebook_gouv/prix_carburant_gouv.ipynb b/notebook_gouv/prix_carburant_gouv.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..841f87205f246738cd89e95c9a9e70161486c4e0 --- /dev/null +++ b/notebook_gouv/prix_carburant_gouv.ipynb @@ -0,0 +1,396 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 14, + "id": "d60999c6-2ae5-430b-934c-a95d309a496c", + "metadata": {}, + "outputs": [], + "source": [ + "import zipfile\n", + "import os\n", + "import xml.etree.ElementTree as ET\n", + "import csv\n", + "import time\n", + "from urllib.request import urlretrieve\n", + "from datetime import date\n", + "from calendar import monthrange\n", + "\n", + "import pandas as pd\n", + "import requests" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "bbf067e2-95d6-4375-93f2-41ef842893b0", + "metadata": {}, + "outputs": [], + "source": [ + "#recupération des bases de donnée sur le site du gouvernement.\n", + "def recuperation_xml(date_debut,date_fin):\n", + " for date in range(date_debut, date_fin +1, 1):\n", + " directory_to_extract_to = os.path.join(\"unzip_file\")\n", + " path_to_zip_file = os.path.join(\"zip_file\",f\"PrixCarburants_annuel_{date}.zip\")\n", + " urlretrieve(f\"https://donnees.roulez-eco.fr/opendata/annee/{date}\", path_to_zip_file)\n", + " with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:\n", + " zip_ref.extractall(directory_to_extract_to)\n", + "#recuperation_xml(2007,2021)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "6c27528f-fbbd-4c34-86fe-a904c8181f77", + "metadata": {}, + "outputs": [], + "source": [ + "# utilisation de l'API de adress.data.gouv.fr pour passer de la latitude et longitude, au citycode\n", + "def citycode_from_lat_long(longitude,latitude):\n", + " url = f\"https://api-adresse.data.gouv.fr/reverse/?lon={longitude}&lat={latitude}\"\n", + " response = requests.get(url)\n", + " contenu = response.json() \n", + " features = contenu['features']\n", + " if len(features) == 0:\n", + " return None\n", + " else:\n", + " citycode = contenu['features'][0]['properties']['citycode']\n", + " return citycode" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "d67ca228-3db6-446b-bcac-f1efafd129f6", + "metadata": {}, + "outputs": [], + "source": [ + "# passage du citycode au code du departement\n", + "def code_departement_from_citycode(citycode):\n", + " if citycode[ : 2] >= '97':\n", + " code_departement = citycode[ : 3]\n", + " else:\n", + " code_departement = citycode[ : 2]\n", + " return code_departement" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "e8b5e2f4-2095-4c8f-a11d-d11de4cff76c", + "metadata": {}, + "outputs": [], + "source": [ + "# passage du code postal au code du departement\n", + "def code_departement_from_code_postal(code_postal):\n", + " if code_postal == '99999':\n", + " return None\n", + " elif code_postal[ : 2] >= '97':\n", + " code_departement = code_postal[ : 3]\n", + " elif code_postal[ : 3] in [\"200\",\"201\"] :\n", + " code_departement = \"2A\"\n", + " elif code_postal[ : 3] in [\"202\",\"206\"]:\n", + " code_departement = \"2B\"\n", + " else:\n", + " code_departement = code_postal[ : 2] \n", + " return code_departement" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "64a0d8fc-649a-4710-839e-416706a5f712", + "metadata": {}, + "outputs": [], + "source": [ + "# passage du code du departement au code région en utilisant l'API Métadonnées - V1 de l'INSEE\n", + "# documentation à API nomenclatures géographiques Insee\n", + "# attention, la clé doit être réactualisé tous les 7 jours...\n", + "# l'API est limité à 30 requêtes par minute\n", + "def code_region_from_code_departement(code_departement,date):\n", + " headers = {\n", + " 'Accept': 'application/json',\n", + " 'Authorization': 'Bearer 82590123-79ba-3b05-ad0c-fdfe657eaf7a', #Le changement est ici\n", + " }\n", + " params = {\n", + " 'date': date,\n", + " }\n", + " response = requests.get(f'https://api.insee.fr/metadonnees/V1/geo/departement/{code_departement}/ascendants', params=params, headers=headers)\n", + " contenu = response.json()\n", + " time.sleep(2.1)\n", + " if isinstance(contenu,dict):\n", + " print(contenu)\n", + " return contenu[0]['code']" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "c5f67bd6-5cf9-4e09-a587-f4b2454f4618", + "metadata": {}, + "outputs": [], + "source": [ + "#Les APIs sont relativement fragile, il arrive qu'il y ai des erreurs 500 ou 502. \n", + "#Dans le cas là if faut supprimer l'année qui était en train de boucler de \"prix_by_region\".\n", + "#Il faut ensuite recommencer la boucle à partir de cette date.\n", + "def debug_if_error_500(date_debut,date_fin):\n", + " for region, prix_by_carburant in prix_by_region.items():\n", + " for carburant,prix_by_annee in prix_by_carburant.items():\n", + " for annee in range(date_debut,date_fin+1):\n", + " if annee in prix_by_annee:\n", + " del prix_by_annee[annee]\n", + "debug_if_error_500(2013,2013)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "14979ff2-770a-4a6c-8780-13a76a98512a", + "metadata": {}, + "outputs": [], + "source": [ + "tree = ET.parse('unzip_file/PrixCarburants_annuel_2021.xml')\n", + "pdv_liste = tree.getroot()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "bb42e6c2-f9e8-49da-a372-88b9b869993b", + "metadata": {}, + "outputs": [], + "source": [ + "citycode_lat_long = {} \n", + "prix_by_region = {}" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "b8a5473c-902a-4d1f-9318-ad52e425cf3e", + "metadata": {}, + "outputs": [], + "source": [ + "#prix_by_region" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2cd9550a-5c9b-4787-a372-d4f8309eaf9d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2007\n" + ] + } + ], + "source": [ + "#boucle principale, qui récupére les données des fichiers XML,\n", + "#trouve le code région de chaque station, \n", + "#récupère les données importantes, dont le prix par jour, par carburant, par station,\n", + "#fait la moyenne par jour \n", + "for annee in range(2007,2022):\n", + " print(annee)\n", + " tree = ET.parse(f'unzip_file/PrixCarburants_annuel_{annee}.xml')\n", + " pdv_liste = tree.getroot()\n", + " date = f'{annee}-01-01'\n", + " region = {} \n", + " for pdv in pdv_liste:\n", + " longitude = pdv.attrib.get('longitude')\n", + " latitude = pdv.attrib.get('latitude')\n", + " citycode = None\n", + " if latitude and longitude:\n", + " lat_long = f\"{latitude},{longitude}\"\n", + " citycode = citycode_lat_long.get(lat_long)\n", + " if citycode is None:\n", + " citycode = citycode_from_lat_long(float(longitude)/100000,float(latitude)/100000)\n", + " if citycode is not None:\n", + " citycode_lat_long[lat_long] = citycode\n", + " code_departement = (\n", + " code_departement_from_code_postal(pdv.attrib['cp'])\n", + " if citycode is None\n", + " else code_departement_from_citycode(citycode)\n", + " )\n", + " if code_departement is None:\n", + " print('code_departement is None')\n", + " continue\n", + " code_region = region.get(code_departement) \n", + " if code_region is None:\n", + " code_region = code_region_from_code_departement(code_departement,date)\n", + " region[code_departement]= code_region\n", + " for prix_element in pdv:\n", + " if prix_element.tag != 'prix':\n", + " continue\n", + " if prix_element.attrib.get('maj') is None:\n", + " continue\n", + " if prix_element.attrib.get('nom') is None:\n", + " continue\n", + " if prix_element.attrib.get('valeur') is None:\n", + " continue\n", + " prix_by_carburant = prix_by_region.setdefault(code_region,{})\n", + "# prix_by_carburant = prix_by_region.get(code_region)\n", + "# if prix_by_carburant is None:\n", + "# prix_by_carburant = prix_by_region[code_region] = {}\n", + " if 'T' in prix_element.attrib['maj']:\n", + " date_prix = prix_element.attrib['maj'].split('T')[0]\n", + " else:\n", + " date_prix = prix_element.attrib['maj'].split(' ')[0]\n", + " annee_prix, mois_prix, jour_prix = date_prix.split('-')\n", + " annee_prix, mois_prix, jour_prix = int(annee_prix), int(mois_prix), int(jour_prix)\n", + " prix_by_annee = prix_by_carburant.setdefault(prix_element.attrib['nom'],{})\n", + " prix_by_mois = prix_by_annee.setdefault(annee_prix,{})\n", + " prix_by_jour = prix_by_mois.setdefault(mois_prix,{})\n", + " prix_by_station = prix_by_jour.setdefault(jour_prix,{})\n", + " prix_by_station[pdv.attrib['id']] = prix_element.attrib['valeur']\n", + " \n", + " for region, prix_by_carburant in prix_by_region.items():\n", + " stations = set()\n", + " prix_by_carburant = prix_by_region[region] \n", + " for carburant,prix_by_annee in prix_by_carburant.items():\n", + " dernier_prix_par_station = {}\n", + " prix_by_mois = prix_by_annee[annee]\n", + " for mois in range(1,13):\n", + " prix_by_jour = prix_by_mois.setdefault(mois,{})\n", + " dernier_jour = monthrange(annee, mois)[1]\n", + " for jour in range(1,dernier_jour+1):\n", + " prix_by_station = prix_by_jour.get(jour)\n", + " stations = stations.union(prix_by_station.keys())\n", + " for station in stations:\n", + " prix = prix_by_station.get(station)\n", + " if prix is None:\n", + " prix_by_station[station] = dernier_prix_par_station.get(station)\n", + " else:\n", + " dernier_prix_par_station[station] = prix\n", + "\n", + " for region, prix_by_carburant in prix_by_region.items():\n", + " for carburant, prix_by_annee in prix_by_carburant.items():\n", + " prix_by_mois = prix_by_annee[annee]\n", + " for annee, prix_by_mois in prix_by_annee.items():\n", + " for mois, prix_by_jour in prix_by_mois.items(): \n", + " for jour, prix_by_station in prix_by_jour.items():\n", + " count = 0\n", + " total = 0\n", + " for station, prix in prix_by_station.items():\n", + " if prix is not None:\n", + " total += float(prix)\n", + " count += 1 \n", + " prix_by_jour[jour] = round(total / count, 2) if count > 0 else None\n", + "\n", + " for region, prix_by_carburant in prix_by_region.items():\n", + " for carburant,prix_by_annee in prix_by_carburant.items():\n", + " prix_by_mois = prix_by_annee[annee]\n", + " count_annee = 0\n", + " total_annee = 0\n", + " for mois,prix_by_jour in prix_by_mois.items():\n", + " count_mois = 0\n", + " total_mois = 0\n", + " for jour, prix in prix_by_jour.items():\n", + " if prix is not None:\n", + " count_mois += 1\n", + " total_mois += prix\n", + " count_annee += 1\n", + " total_annee += prix\n", + " if count_mois == 0:\n", + " prix_by_mois[mois] = None\n", + " else:\n", + " prix_by_mois[mois] = round(total_mois / count_mois,2)\n", + " if count_annee == 0:\n", + " prix_by_mois['moyenne'] = None\n", + " else:\n", + " prix_by_mois['moyenne'] = round(total_annee / count_annee,2)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "0f26bf8a-397d-4522-8409-f9f4681ce870", + "metadata": {}, + "outputs": [], + "source": [ + "#Lisse le dictionnaire \"prix_by_region\".\n", + "liste_prix_mensuel=[] \n", + "liste_prix_annuel=[]\n", + "for region, prix_by_carburant in prix_by_region.items():\n", + " for carburant,prix_by_annee in prix_by_carburant.items():\n", + " for annee,prix_by_mois in prix_by_annee.items():\n", + " for mois,prix in prix_by_mois.items():\n", + " if prix_by_mois.values == 'moyenne':\n", + " pass\n", + " prix_region_mensuel = {\n", + " \"region\": region,\n", + " \"carburant\": carburant,\n", + " \"annee\": annee,\n", + " \"mois\": mois,\n", + " \"prix_moyen\": prix,\n", + " }\n", + " liste_prix_mensuel.append(prix_region_mensuel)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "8a712431-90ff-42bb-9449-3f89bbaf2a15", + "metadata": {}, + "outputs": [], + "source": [ + "#créer la dataframe \"prix_mensuel_final.csv\"\n", + "df = pd.DataFrame.from_dict(liste_prix_mensuel)\n", + "indexNames = df[ df['mois'] == 'moyenne' ].index\n", + "df.drop(indexNames , inplace=True)\n", + "df.reset_index(drop = True, inplace = True)\n", + "df['prix_moyen'] = round(df['prix_moyen'] * 0.001,2)\n", + "df.to_csv (r'prix_mensuel_final.csv', index = False, header=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "0803570e-3b2c-4f0d-bc8b-aa34a3f6dfa6", + "metadata": {}, + "outputs": [], + "source": [ + "#créer la dataframe \"prix_annuel_final.csv\"\n", + "df = pd.DataFrame.from_dict(liste_prix_mensuel)\n", + "indexNames = df[ df['mois'] != 'moyenne' ].index\n", + "df.drop(indexNames , inplace=True)\n", + "df.reset_index(drop = True, inplace = True)\n", + "df.drop(columns=['mois'],inplace=True)\n", + "df['prix_moyen'] = round(df['prix_moyen'] * 0.001,2)\n", + "df.to_csv (r'prix_annuel_final.csv', index = False, header=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7816857-8629-4f2a-b49f-3b4ca0c9fc16", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "indirect-taxation-kernel", + "language": "python", + "name": "indirect-taxation-kernel" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebook_gouv/zip_file/PrixCarburants_annuel_2007.zip b/notebook_gouv/zip_file/PrixCarburants_annuel_2007.zip new file mode 100644 index 0000000000000000000000000000000000000000..85cb395726b54cc0f1b3b7bb4296b6123b46ba69 Binary files /dev/null and b/notebook_gouv/zip_file/PrixCarburants_annuel_2007.zip differ diff --git a/notebook_gouv/zip_file/PrixCarburants_annuel_2008.zip b/notebook_gouv/zip_file/PrixCarburants_annuel_2008.zip new file mode 100644 index 0000000000000000000000000000000000000000..3f7ba00afcda1e64fe217e1f6c50dc0cdfa559f8 Binary files /dev/null and b/notebook_gouv/zip_file/PrixCarburants_annuel_2008.zip differ diff --git a/notebook_gouv/zip_file/PrixCarburants_annuel_2009.zip b/notebook_gouv/zip_file/PrixCarburants_annuel_2009.zip new file mode 100644 index 0000000000000000000000000000000000000000..754851c1c4e1e5aafc5c3c8db32705749c668646 Binary files /dev/null and b/notebook_gouv/zip_file/PrixCarburants_annuel_2009.zip differ diff --git a/notebook_gouv/zip_file/PrixCarburants_annuel_2010.zip b/notebook_gouv/zip_file/PrixCarburants_annuel_2010.zip new file mode 100644 index 0000000000000000000000000000000000000000..02a025fbe7c6d0ac6cd3d9cf00f0aeaa7f632004 Binary files /dev/null and b/notebook_gouv/zip_file/PrixCarburants_annuel_2010.zip differ diff --git a/notebook_gouv/zip_file/PrixCarburants_annuel_2011.zip b/notebook_gouv/zip_file/PrixCarburants_annuel_2011.zip new file mode 100644 index 0000000000000000000000000000000000000000..a85753a9d4ed7cf64b39141f880f3ac6b877dc23 Binary files /dev/null and b/notebook_gouv/zip_file/PrixCarburants_annuel_2011.zip differ diff --git a/notebook_gouv/zip_file/PrixCarburants_annuel_2012.zip b/notebook_gouv/zip_file/PrixCarburants_annuel_2012.zip new file mode 100644 index 0000000000000000000000000000000000000000..0039e1c9735f961eef16eeeb0693d7ad9d8081b6 Binary files /dev/null and b/notebook_gouv/zip_file/PrixCarburants_annuel_2012.zip differ diff --git a/notebook_gouv/zip_file/PrixCarburants_annuel_2013.zip b/notebook_gouv/zip_file/PrixCarburants_annuel_2013.zip new file mode 100644 index 0000000000000000000000000000000000000000..b0e6f6cdd2f2f81f238d103e46a6ced9857daf78 Binary files /dev/null and b/notebook_gouv/zip_file/PrixCarburants_annuel_2013.zip differ diff --git a/notebook_gouv/zip_file/PrixCarburants_annuel_2014.zip b/notebook_gouv/zip_file/PrixCarburants_annuel_2014.zip new file mode 100644 index 0000000000000000000000000000000000000000..40be5fee489d33ce4ec31149d8f202960392d8cf Binary files /dev/null and b/notebook_gouv/zip_file/PrixCarburants_annuel_2014.zip differ diff --git a/notebook_gouv/zip_file/PrixCarburants_annuel_2015.zip b/notebook_gouv/zip_file/PrixCarburants_annuel_2015.zip new file mode 100644 index 0000000000000000000000000000000000000000..58279297001da60f88ce4a812c176a391145dc55 Binary files /dev/null and b/notebook_gouv/zip_file/PrixCarburants_annuel_2015.zip differ diff --git a/notebook_gouv/zip_file/PrixCarburants_annuel_2016.zip b/notebook_gouv/zip_file/PrixCarburants_annuel_2016.zip new file mode 100644 index 0000000000000000000000000000000000000000..dffc520ffd8250e416e24397ca8443d575bceb24 Binary files /dev/null and b/notebook_gouv/zip_file/PrixCarburants_annuel_2016.zip differ diff --git a/notebook_gouv/zip_file/PrixCarburants_annuel_2017.zip b/notebook_gouv/zip_file/PrixCarburants_annuel_2017.zip new file mode 100644 index 0000000000000000000000000000000000000000..14c3df7a16b31fbca9f36cf0979e53b9caa924ee Binary files /dev/null and b/notebook_gouv/zip_file/PrixCarburants_annuel_2017.zip differ diff --git a/notebook_gouv/zip_file/PrixCarburants_annuel_2018.zip b/notebook_gouv/zip_file/PrixCarburants_annuel_2018.zip new file mode 100644 index 0000000000000000000000000000000000000000..3b6232d7992bdc64c6acb003b7bc6411746df009 Binary files /dev/null and b/notebook_gouv/zip_file/PrixCarburants_annuel_2018.zip differ diff --git a/notebook_gouv/zip_file/PrixCarburants_annuel_2019.zip b/notebook_gouv/zip_file/PrixCarburants_annuel_2019.zip new file mode 100644 index 0000000000000000000000000000000000000000..f68a427a127d62df039977e7123138f2897e6b27 Binary files /dev/null and b/notebook_gouv/zip_file/PrixCarburants_annuel_2019.zip differ diff --git a/notebook_gouv/zip_file/PrixCarburants_annuel_2020.zip b/notebook_gouv/zip_file/PrixCarburants_annuel_2020.zip new file mode 100644 index 0000000000000000000000000000000000000000..41a9a252b57ea555c356b1d5f82f37cf91b95731 Binary files /dev/null and b/notebook_gouv/zip_file/PrixCarburants_annuel_2020.zip differ diff --git a/notebook_gouv/zip_file/PrixCarburants_annuel_2021.zip b/notebook_gouv/zip_file/PrixCarburants_annuel_2021.zip new file mode 100644 index 0000000000000000000000000000000000000000..565575ccfded2ba7a7efd6ec158ab30b10ff0278 Binary files /dev/null and b/notebook_gouv/zip_file/PrixCarburants_annuel_2021.zip differ