From 55931d1343b4701e15b8acd18523e0a4bdbef113 Mon Sep 17 00:00:00 2001 From: kendrick <herzbergkendrick@gmail.com> Date: Wed, 22 Jun 2022 10:47:18 +0200 Subject: [PATCH] remove file --- notebook_gouv/prix_carburant_gouv.ipynb | 396 ------------------------ 1 file changed, 396 deletions(-) delete mode 100644 notebook_gouv/prix_carburant_gouv.ipynb diff --git a/notebook_gouv/prix_carburant_gouv.ipynb b/notebook_gouv/prix_carburant_gouv.ipynb deleted file mode 100644 index 841f872..0000000 --- a/notebook_gouv/prix_carburant_gouv.ipynb +++ /dev/null @@ -1,396 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 14, - "id": "d60999c6-2ae5-430b-934c-a95d309a496c", - "metadata": {}, - "outputs": [], - "source": [ - "import zipfile\n", - "import os\n", - "import xml.etree.ElementTree as ET\n", - "import csv\n", - "import time\n", - "from urllib.request import urlretrieve\n", - "from datetime import date\n", - "from calendar import monthrange\n", - "\n", - "import pandas as pd\n", - "import requests" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "bbf067e2-95d6-4375-93f2-41ef842893b0", - "metadata": {}, - "outputs": [], - "source": [ - "#recupération des bases de donnée sur le site du gouvernement.\n", - "def recuperation_xml(date_debut,date_fin):\n", - " for date in range(date_debut, date_fin +1, 1):\n", - " directory_to_extract_to = os.path.join(\"unzip_file\")\n", - " path_to_zip_file = os.path.join(\"zip_file\",f\"PrixCarburants_annuel_{date}.zip\")\n", - " urlretrieve(f\"https://donnees.roulez-eco.fr/opendata/annee/{date}\", path_to_zip_file)\n", - " with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:\n", - " zip_ref.extractall(directory_to_extract_to)\n", - "#recuperation_xml(2007,2021)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "6c27528f-fbbd-4c34-86fe-a904c8181f77", - "metadata": {}, - "outputs": [], - "source": [ - "# utilisation de l'API de adress.data.gouv.fr pour passer de la latitude et longitude, au citycode\n", - "def citycode_from_lat_long(longitude,latitude):\n", - " url = f\"https://api-adresse.data.gouv.fr/reverse/?lon={longitude}&lat={latitude}\"\n", - " response = requests.get(url)\n", - " contenu = response.json() \n", - " features = contenu['features']\n", - " if len(features) == 0:\n", - " return None\n", - " else:\n", - " citycode = contenu['features'][0]['properties']['citycode']\n", - " return citycode" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "d67ca228-3db6-446b-bcac-f1efafd129f6", - "metadata": {}, - "outputs": [], - "source": [ - "# passage du citycode au code du departement\n", - "def code_departement_from_citycode(citycode):\n", - " if citycode[ : 2] >= '97':\n", - " code_departement = citycode[ : 3]\n", - " else:\n", - " code_departement = citycode[ : 2]\n", - " return code_departement" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "e8b5e2f4-2095-4c8f-a11d-d11de4cff76c", - "metadata": {}, - "outputs": [], - "source": [ - "# passage du code postal au code du departement\n", - "def code_departement_from_code_postal(code_postal):\n", - " if code_postal == '99999':\n", - " return None\n", - " elif code_postal[ : 2] >= '97':\n", - " code_departement = code_postal[ : 3]\n", - " elif code_postal[ : 3] in [\"200\",\"201\"] :\n", - " code_departement = \"2A\"\n", - " elif code_postal[ : 3] in [\"202\",\"206\"]:\n", - " code_departement = \"2B\"\n", - " else:\n", - " code_departement = code_postal[ : 2] \n", - " return code_departement" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "64a0d8fc-649a-4710-839e-416706a5f712", - "metadata": {}, - "outputs": [], - "source": [ - "# passage du code du departement au code région en utilisant l'API Métadonnées - V1 de l'INSEE\n", - "# documentation à API nomenclatures géographiques Insee\n", - "# attention, la clé doit être réactualisé tous les 7 jours...\n", - "# l'API est limité à 30 requêtes par minute\n", - "def code_region_from_code_departement(code_departement,date):\n", - " headers = {\n", - " 'Accept': 'application/json',\n", - " 'Authorization': 'Bearer 82590123-79ba-3b05-ad0c-fdfe657eaf7a', #Le changement est ici\n", - " }\n", - " params = {\n", - " 'date': date,\n", - " }\n", - " response = requests.get(f'https://api.insee.fr/metadonnees/V1/geo/departement/{code_departement}/ascendants', params=params, headers=headers)\n", - " contenu = response.json()\n", - " time.sleep(2.1)\n", - " if isinstance(contenu,dict):\n", - " print(contenu)\n", - " return contenu[0]['code']" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "c5f67bd6-5cf9-4e09-a587-f4b2454f4618", - "metadata": {}, - "outputs": [], - "source": [ - "#Les APIs sont relativement fragile, il arrive qu'il y ai des erreurs 500 ou 502. \n", - "#Dans le cas là if faut supprimer l'année qui était en train de boucler de \"prix_by_region\".\n", - "#Il faut ensuite recommencer la boucle à partir de cette date.\n", - "def debug_if_error_500(date_debut,date_fin):\n", - " for region, prix_by_carburant in prix_by_region.items():\n", - " for carburant,prix_by_annee in prix_by_carburant.items():\n", - " for annee in range(date_debut,date_fin+1):\n", - " if annee in prix_by_annee:\n", - " del prix_by_annee[annee]\n", - "debug_if_error_500(2013,2013)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "14979ff2-770a-4a6c-8780-13a76a98512a", - "metadata": {}, - "outputs": [], - "source": [ - "tree = ET.parse('unzip_file/PrixCarburants_annuel_2021.xml')\n", - "pdv_liste = tree.getroot()" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "bb42e6c2-f9e8-49da-a372-88b9b869993b", - "metadata": {}, - "outputs": [], - "source": [ - "citycode_lat_long = {} \n", - "prix_by_region = {}" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "b8a5473c-902a-4d1f-9318-ad52e425cf3e", - "metadata": {}, - "outputs": [], - "source": [ - "#prix_by_region" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2cd9550a-5c9b-4787-a372-d4f8309eaf9d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2007\n" - ] - } - ], - "source": [ - "#boucle principale, qui récupére les données des fichiers XML,\n", - "#trouve le code région de chaque station, \n", - "#récupère les données importantes, dont le prix par jour, par carburant, par station,\n", - "#fait la moyenne par jour \n", - "for annee in range(2007,2022):\n", - " print(annee)\n", - " tree = ET.parse(f'unzip_file/PrixCarburants_annuel_{annee}.xml')\n", - " pdv_liste = tree.getroot()\n", - " date = f'{annee}-01-01'\n", - " region = {} \n", - " for pdv in pdv_liste:\n", - " longitude = pdv.attrib.get('longitude')\n", - " latitude = pdv.attrib.get('latitude')\n", - " citycode = None\n", - " if latitude and longitude:\n", - " lat_long = f\"{latitude},{longitude}\"\n", - " citycode = citycode_lat_long.get(lat_long)\n", - " if citycode is None:\n", - " citycode = citycode_from_lat_long(float(longitude)/100000,float(latitude)/100000)\n", - " if citycode is not None:\n", - " citycode_lat_long[lat_long] = citycode\n", - " code_departement = (\n", - " code_departement_from_code_postal(pdv.attrib['cp'])\n", - " if citycode is None\n", - " else code_departement_from_citycode(citycode)\n", - " )\n", - " if code_departement is None:\n", - " print('code_departement is None')\n", - " continue\n", - " code_region = region.get(code_departement) \n", - " if code_region is None:\n", - " code_region = code_region_from_code_departement(code_departement,date)\n", - " region[code_departement]= code_region\n", - " for prix_element in pdv:\n", - " if prix_element.tag != 'prix':\n", - " continue\n", - " if prix_element.attrib.get('maj') is None:\n", - " continue\n", - " if prix_element.attrib.get('nom') is None:\n", - " continue\n", - " if prix_element.attrib.get('valeur') is None:\n", - " continue\n", - " prix_by_carburant = prix_by_region.setdefault(code_region,{})\n", - "# prix_by_carburant = prix_by_region.get(code_region)\n", - "# if prix_by_carburant is None:\n", - "# prix_by_carburant = prix_by_region[code_region] = {}\n", - " if 'T' in prix_element.attrib['maj']:\n", - " date_prix = prix_element.attrib['maj'].split('T')[0]\n", - " else:\n", - " date_prix = prix_element.attrib['maj'].split(' ')[0]\n", - " annee_prix, mois_prix, jour_prix = date_prix.split('-')\n", - " annee_prix, mois_prix, jour_prix = int(annee_prix), int(mois_prix), int(jour_prix)\n", - " prix_by_annee = prix_by_carburant.setdefault(prix_element.attrib['nom'],{})\n", - " prix_by_mois = prix_by_annee.setdefault(annee_prix,{})\n", - " prix_by_jour = prix_by_mois.setdefault(mois_prix,{})\n", - " prix_by_station = prix_by_jour.setdefault(jour_prix,{})\n", - " prix_by_station[pdv.attrib['id']] = prix_element.attrib['valeur']\n", - " \n", - " for region, prix_by_carburant in prix_by_region.items():\n", - " stations = set()\n", - " prix_by_carburant = prix_by_region[region] \n", - " for carburant,prix_by_annee in prix_by_carburant.items():\n", - " dernier_prix_par_station = {}\n", - " prix_by_mois = prix_by_annee[annee]\n", - " for mois in range(1,13):\n", - " prix_by_jour = prix_by_mois.setdefault(mois,{})\n", - " dernier_jour = monthrange(annee, mois)[1]\n", - " for jour in range(1,dernier_jour+1):\n", - " prix_by_station = prix_by_jour.get(jour)\n", - " stations = stations.union(prix_by_station.keys())\n", - " for station in stations:\n", - " prix = prix_by_station.get(station)\n", - " if prix is None:\n", - " prix_by_station[station] = dernier_prix_par_station.get(station)\n", - " else:\n", - " dernier_prix_par_station[station] = prix\n", - "\n", - " for region, prix_by_carburant in prix_by_region.items():\n", - " for carburant, prix_by_annee in prix_by_carburant.items():\n", - " prix_by_mois = prix_by_annee[annee]\n", - " for annee, prix_by_mois in prix_by_annee.items():\n", - " for mois, prix_by_jour in prix_by_mois.items(): \n", - " for jour, prix_by_station in prix_by_jour.items():\n", - " count = 0\n", - " total = 0\n", - " for station, prix in prix_by_station.items():\n", - " if prix is not None:\n", - " total += float(prix)\n", - " count += 1 \n", - " prix_by_jour[jour] = round(total / count, 2) if count > 0 else None\n", - "\n", - " for region, prix_by_carburant in prix_by_region.items():\n", - " for carburant,prix_by_annee in prix_by_carburant.items():\n", - " prix_by_mois = prix_by_annee[annee]\n", - " count_annee = 0\n", - " total_annee = 0\n", - " for mois,prix_by_jour in prix_by_mois.items():\n", - " count_mois = 0\n", - " total_mois = 0\n", - " for jour, prix in prix_by_jour.items():\n", - " if prix is not None:\n", - " count_mois += 1\n", - " total_mois += prix\n", - " count_annee += 1\n", - " total_annee += prix\n", - " if count_mois == 0:\n", - " prix_by_mois[mois] = None\n", - " else:\n", - " prix_by_mois[mois] = round(total_mois / count_mois,2)\n", - " if count_annee == 0:\n", - " prix_by_mois['moyenne'] = None\n", - " else:\n", - " prix_by_mois['moyenne'] = round(total_annee / count_annee,2)" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "0f26bf8a-397d-4522-8409-f9f4681ce870", - "metadata": {}, - "outputs": [], - "source": [ - "#Lisse le dictionnaire \"prix_by_region\".\n", - "liste_prix_mensuel=[] \n", - "liste_prix_annuel=[]\n", - "for region, prix_by_carburant in prix_by_region.items():\n", - " for carburant,prix_by_annee in prix_by_carburant.items():\n", - " for annee,prix_by_mois in prix_by_annee.items():\n", - " for mois,prix in prix_by_mois.items():\n", - " if prix_by_mois.values == 'moyenne':\n", - " pass\n", - " prix_region_mensuel = {\n", - " \"region\": region,\n", - " \"carburant\": carburant,\n", - " \"annee\": annee,\n", - " \"mois\": mois,\n", - " \"prix_moyen\": prix,\n", - " }\n", - " liste_prix_mensuel.append(prix_region_mensuel)" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "8a712431-90ff-42bb-9449-3f89bbaf2a15", - "metadata": {}, - "outputs": [], - "source": [ - "#créer la dataframe \"prix_mensuel_final.csv\"\n", - "df = pd.DataFrame.from_dict(liste_prix_mensuel)\n", - "indexNames = df[ df['mois'] == 'moyenne' ].index\n", - "df.drop(indexNames , inplace=True)\n", - "df.reset_index(drop = True, inplace = True)\n", - "df['prix_moyen'] = round(df['prix_moyen'] * 0.001,2)\n", - "df.to_csv (r'prix_mensuel_final.csv', index = False, header=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "0803570e-3b2c-4f0d-bc8b-aa34a3f6dfa6", - "metadata": {}, - "outputs": [], - "source": [ - "#créer la dataframe \"prix_annuel_final.csv\"\n", - "df = pd.DataFrame.from_dict(liste_prix_mensuel)\n", - "indexNames = df[ df['mois'] != 'moyenne' ].index\n", - "df.drop(indexNames , inplace=True)\n", - "df.reset_index(drop = True, inplace = True)\n", - "df.drop(columns=['mois'],inplace=True)\n", - "df['prix_moyen'] = round(df['prix_moyen'] * 0.001,2)\n", - "df.to_csv (r'prix_annuel_final.csv', index = False, header=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b7816857-8629-4f2a-b49f-3b4ca0c9fc16", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "indirect-taxation-kernel", - "language": "python", - "name": "indirect-taxation-kernel" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} -- GitLab