from leximpact_common_python_libraries.config import Configuration
config = Configuration(project_folder="leximpact-prepare-data")
Experiments and development of the margin calibration (calage sur marges) method
import unittest
tc = unittest.TestCase()
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Make NumPy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
CALMAR
Data sample for testing
# A first step is to put the database in the form Y = f(X), with X the RFR and Y our variables of interest
# Initialise our data:
# sample = erfs, X = RFR, d = wprm (the survey design weights) and Y our variable of interest
sample = pd.DataFrame(
[
[1, 1, 0, 12], # Bucket 0
[2, 1, 0, 32],
[3, 1, 0, 5],
[4, 1, 46, 0], # Bucket 0-100
[5, 1, 99, 4323],
[6, 1, 90, 104],
[7, 1, 250, 102], # Bucket 250-1000
[8, 1, 300, 1253],
[9, 1, 1000, 92],
[10, 1, 21_000, 9217], # Bucket 1000-25000
[11, 1, 12_000, 91],
[12, 1, 1000, 0],
[13, 1, 8000, 0],
[14, 1, 1830, 9812],
[15, 1, 1185, 100281],
[16, 1, 1981, 9822],
[17, 1, 18417, 91],
[18, 1, 26_000, 2301], # Bucket 25000-50000
[19, 1, 49_000, 87203],
],
columns=["idfoy", "d", "X", "Y"],
)
sample.head()
|   | idfoy | d | X | Y |
|---|---|---|---|---|
| 0 | 1 | 1 | 0 | 12 |
| 1 | 2 | 1 | 0 | 32 |
| 2 | 3 | 1 | 0 | 5 |
| 3 | 4 | 1 | 46 | 0 |
| 4 | 5 | 1 | 99 | 4323 |
estimateur_y = (sample["d"] * sample["Y"]).sum()
print(estimateur_y)
224741
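Before turning to the neural-network approach, here is a minimal sketch of what CALMAR-style calibration does on this toy sample: starting from the design weights d, find new weights w, as close as possible to d, such that the weighted total of the auxiliary variable X matches a known margin. The linear (chi-square) variant below and the target total_x = 150_000 are illustrative assumptions only, not the exact method used elsewhere in this notebook.
def linear_calibration(d, x, total_x):
    """Linear-method calibration with one auxiliary variable: w = d * (1 + lam * x),
    with lam chosen so that sum(w * x) == total_x (chi-square distance to d)."""
    d = np.asarray(d, dtype=float)
    x = np.asarray(x, dtype=float)
    lam = (total_x - (d * x).sum()) / (d * x * x).sum()
    return d * (1 + lam * x)

w = linear_calibration(sample["d"], sample["X"], total_x=150_000)
print("Calibrated total of X:", (w * sample["X"]).sum())  # matches total_x by construction
print("Calibrated estimator of Y:", (w * sample["Y"]).sum())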
Neural Network
Data sample for testing
# Import the ERFS database and select a sample
erfs_03 = pd.read_hdf(
config.get("DATA_OUT") + "03_erfs_rfr_cal_ind" + config.get("YEAR_ERFS") + ".h5"
)
erfs_03.columns
erfs_03.tail()
Index(['activite', 'age', 'categorie_salarie', 'chomage_brut',
'contrat_de_travail', 'date_naissance', 'effectif_entreprise',
'heures_remunerees_volume', 'idfam', 'idfoy', 'idmen', 'noindiv',
'pensions_alimentaires_percues', 'quifam', 'quifoy', 'quimen', 'rag',
'retraite_brute', 'ric', 'rnc', 'statut_marital', 'salaire_de_base',
'idmen_original', 'idfoy_original', 'idfam_original', 'idmen_x', 'wprm',
'zone_apl', 'fake_id', 'f4ba', 'quimenof', 'quifoyof', 'quifamof',
'rfr'],
dtype='object')
|   | activite | age | categorie_salarie | chomage_brut | contrat_de_travail | date_naissance | effectif_entreprise | heures_remunerees_volume | idfam | idfoy | ... | idfam_original | idmen_x | wprm | zone_apl | fake_id | f4ba | quimenof | quifoyof | quifamof | rfr |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 337803 | 0.0 | 51 | 1 | 0 | 0 | 1966-01-14 | 50 | 0.0 | 172206 | 172206 | ... | 1802957401 | 18011370 | 0.668483 | 2 | 1.0 | 4399.0 | personne_de_reference | declarant_principal | demandeur | 3.186688e+07 |
| 337804 | 0.0 | 44 | 1 | 0 | 0 | 1973-01-14 | 500 | 0.0 | 172207 | 172207 | ... | 1801891601 | 18028481 | 0.989284 | 2 | 1.0 | 1500.0 | personne_de_reference | declarant_principal | demandeur | 4.977271e+07 |
| 337805 | 0.0 | 80 | 1 | 0 | 0 | 1937-10-04 | 50 | 0.0 | 172208 | 172208 | ... | 1803209401 | 18019488 | 0.693574 | 2 | 1.0 | 0.0 | personne_de_reference | declarant_principal | demandeur | 9.158978e+07 |
| 337806 | 0.0 | 66 | 0 | 0 | 1 | 1951-01-04 | 0 | 1144.0 | 172209 | 172209 | ... | 1804042701 | 18011185 | 0.885337 | 2 | 1.0 | 139340.0 | personne_de_reference | declarant_principal | demandeur | 6.888963e+07 |
| 337807 | 0.0 | 62 | 0 | 0 | 0 | 1955-04-17 | 500 | 0.0 | 172210 | 172210 | ... | 1803086002 | 18038170 | 0.793936 | 2 | 1.0 | 0.0 | personne_de_reference | declarant_principal | demandeur | 5.150487e+07 |
5 rows × 34 columns
# Our sample for testing
dataset = erfs_03[
[
"idfoy",
"quifoy",
"wprm",
"salaire_de_base",
"chomage_brut",
"retraite_brute",
"f4ba",
"rfr",
]
]
# To test on only part of the database (left disabled: the run below uses the full dataset; note it should sample from `dataset`, not `erfs_03`, to keep the column selection)
# dataset = dataset.sample(n=1000, random_state=53)
print("On a ", len(dataset), "individus dans cet échantillon")
dataset.head()
# Ici, wprm sont les poids initiaux dits "de sondage"On a 337808 individus dans cet échantillon
|   | idfoy | quifoy | wprm | salaire_de_base | chomage_brut | retraite_brute | f4ba | rfr |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 272.745914 | 0.000000 | 0 | 3560 | 0.0 | 755.000244 |
| 1 | 1 | 0 | 227.157260 | 0.000000 | 0 | 19360 | 0.0 | 43778.000000 |
| 2 | 1 | 1 | 227.157260 | 0.000000 | 0 | 28230 | 0.0 | 0.000000 |
| 3 | 2 | 0 | 194.930798 | 65283.748389 | 0 | 0 | 550.0 | 70192.570312 |
| 4 | 2 | 1 | 194.930798 | 29200.808225 | 0 | 0 | 0.0 | 0.000000 |
# Our auxiliary variables are: salaire_de_base, chomage_brut, retraite_brute and f4ba (property income)
J = 4
X_avant = [
(dataset["wprm"] * dataset["salaire_de_base"]).sum(),
(dataset["wprm"] * dataset["chomage_brut"]).sum(),
(dataset["wprm"] * dataset["retraite_brute"]).sum(),
(dataset["wprm"] * dataset["f4ba"]).sum(),
]
X_avant
[787347859186.1661, 35973091573.96126, 425467165630.34125, 44012929315.33398]
# To calibrate, we introduce the target ("wanted") totals (TODO: make these configurable)
X = [
    650855163531.0,  # Taxable salary total, POTE 2019
    0.97
    * (
        dataset["wprm"] * dataset["chomage_brut"]
    ).sum(),  # True figure unknown, so we introduce a gap to calibrate against
    307_254_581_479.0,  # Presumably the retraite_brute target; the comma-separated figure in the original had split into four list items
    1.34
    * (
        dataset["wprm"] * dataset["f4ba"]
    ).sum(),  # True figure unknown, so we introduce a gap to calibrate against
]
X
(In the recorded run this cell raised KeyError: 'wprm', most likely because it was re-executed after wprm had been popped from dataset in the "Data preparation" section below.)
# We are looking for Y_final, the estimated RFR total. For now (with the survey design weights, computed here from salaire_de_base) we have:
Y_initial = (dataset["wprm"] * dataset["salaire_de_base"]).sum()
# We will compare it with
Y_wanted = 1084463009284.0  # RFR POTE 2019
# Which gives the error before calibration:
error_initiale = 100 * abs((Y_initial - Y_wanted) / Y_wanted)
print("Error before calibration:", error_initiale, "%")
Error before calibration: 27.39744440836203 %
Data preparation
Source: https://www.tensorflow.org/tutorials/keras/regression
# Set the weights aside
poids_de_sondage = dataset.pop("wprm")
dataset
|   | idfoy | quifoy | salaire_de_base | chomage_brut | retraite_brute | f4ba | rfr |
|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0.000000 | 0 | 3560 | 0.0 | 7.550002e+02 |
| 1 | 1 | 0 | 0.000000 | 0 | 19360 | 0.0 | 4.377800e+04 |
| 2 | 1 | 1 | 0.000000 | 0 | 28230 | 0.0 | 0.000000e+00 |
| 3 | 2 | 0 | 65283.748389 | 0 | 0 | 550.0 | 7.019257e+04 |
| 4 | 2 | 1 | 29200.808225 | 0 | 0 | 0.0 | 0.000000e+00 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 337803 | 172206 | 0 | 103137.884440 | 0 | 0 | 4399.0 | 3.186688e+07 |
| 337804 | 172207 | 0 | 230051.282743 | 0 | 0 | 1500.0 | 4.977271e+07 |
| 337805 | 172208 | 0 | 75753.850777 | 0 | 44950 | 0.0 | 9.158978e+07 |
| 337806 | 172209 | 0 | 94000.069794 | 0 | 89590 | 139340.0 | 6.888963e+07 |
| 337807 | 172210 | 0 | 207422.526941 | 0 | 0 | 0.0 | 5.150487e+07 |
337808 rows × 7 columns
poids_de_sondage.head()
0 272.745914
1 227.157260
2 227.157260
3 194.930798
4 194.930798
Name: wprm, dtype: float64
Cleaning the data
# Check that there are no NaNs
tc.assertEqual(dataset.isna().sum().sum(), 0)
# If there are, we can remove them:
# dataset = dataset.fillna(0) or dataset = dataset.dropna()
Categorical variable mapping
# Already done in Survey Manager
# dataset['Origin'] = dataset['Origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})
Training dataset vs test dataset
train_dataset = dataset.sample(frac=0.8, random_state=0)  # Seed fixed for development
test_dataset = dataset.drop(train_dataset.index)
train_dataset.head()
test_dataset.head()
Inspecting the data
# We use a KDE (kernel density estimate), which shows the distribution of a single variable as a smooth curve rather than a histogram
sns.pairplot(
train_dataset[["salaire_de_base", "chomage_brut", "retraite_brute", "rfr"]],
diag_kind="kde",
)
train_dataset.describe().transpose()
|   | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| idfoy | 270246.0 | 85734.177205 | 49489.055170 | 0.0 | 42886.25 | 85680.5 | 128600.750000 | 1.722100e+05 |
| quifoy | 270246.0 | 0.729117 | 0.821988 | 0.0 | 0.00 | 0.0 | 1.000000 | 2.000000e+00 |
| salaire_de_base | 270246.0 | 10208.258504 | 24643.348758 | 0.0 | 0.00 | 0.0 | 14964.297000 | 2.404751e+06 |
| chomage_brut | 270246.0 | 464.692539 | 2550.259738 | 0.0 | 0.00 | 0.0 | 0.000000 | 1.908200e+05 |
| retraite_brute | 270246.0 | 5513.533521 | 11321.682394 | 0.0 | 0.00 | 0.0 | 6700.000000 | 3.202300e+05 |
| f4ba | 270246.0 | 1027.900231 | 35894.491765 | 0.0 | 0.00 | 0.0 | 0.000000 | 3.929577e+06 |
| rfr | 270246.0 | 35649.712161 | 857931.558593 | 0.0 | 0.00 | 0.0 | 21366.865503 | 1.000843e+08 |
Separating the target value
train_features = train_dataset.copy()
test_features = test_dataset.copy()
# Set aside the variable we want to predict
train_labels = train_features.pop("rfr")
test_labels = test_features.pop("rfr")
NOTES
- When normalising here, we also transform idfoy and quifoy, which should stay an integer identifier and a categorical code respectively
- To avoid this, it seems we should compute household-level variables only on the household (foyer) table (in which case these two columns can be dropped) and individual-level variables on the individual table; see the sketch below
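A minimal sketch of the second note above, keeping the identifier columns out of the model's inputs; the feature list is an assumption about which variables should feed the regression, not what the cells below actually do:
feature_cols = ["salaire_de_base", "chomage_brut", "retraite_brute", "f4ba"]
# Identifier columns stay untouched; only the monetary variables would be normalised and fed to the model
train_features_no_ids = train_dataset[feature_cols].copy()
test_features_no_ids = test_dataset[feature_cols].copy()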
Normalisation (feature scaling)
# Look at the scale differences between the variables
train_dataset.describe().transpose()[["mean", "std"]]
|   | mean | std |
|---|---|---|
| idfoy | 85734.177205 | 49489.055170 |
| quifoy | 0.729117 | 0.821988 |
| salaire_de_base | 10208.258504 | 24643.348758 |
| chomage_brut | 464.692539 | 2550.259738 |
| retraite_brute | 5513.533521 | 11321.682394 |
| f4ba | 1027.900231 | 35894.491765 |
| rfr | 35649.712161 | 857931.558593 |
# Create the normalization layer
normalizer = tf.keras.layers.Normalization(axis=-1)
# Adapt it to the data (mean and std)
normalizer.adapt(np.array(train_features))
print(normalizer.mean.numpy())
[[85734.01 0.729 10208.252 464.692 5513.554 1027.902]]
# We can see the effect of the normalization on our first row:
first = np.array(train_features[:1])
with np.printoptions(precision=2, suppress=True):
    print("First example:", first)
    print("Normalized:", normalizer(first).numpy())
First example: [[155024. 1. 47722.54 0. 0. 0. ]]
Normalized: [[ 1.4 0.33 1.52 -0.18 -0.49 -0.03]]
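As a quick cross-check, the layer's output should match a manual standardization with the adapted statistics (this assumes the adapted Normalization layer exposes mean and variance attributes, as recent Keras versions do):
manual = (first - normalizer.mean.numpy()) / np.sqrt(normalizer.variance.numpy())
with np.printoptions(precision=2, suppress=True):
    print("Manual standardization:", manual)
    print("Keras Normalization layer:", normalizer(first).numpy())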
Linear regression
We start with a single-variable linear regression: RFR = f(salaire_de_base).
For this we use a sequential model (tf.keras.Sequential) with two steps: normalization (tf.keras.layers.Normalization) and the linear regression itself (tf.keras.layers.Dense).
# Restrict to a single column
salaire = np.array(train_features["salaire_de_base"])
1 - Prepare the normalizer
salaire_normalizer = layers.Normalization(
input_shape=[
1,
],
axis=None,
)
salaire_normalizer.adapt(salaire)
2 - Build a sequential model
salaire_model = tf.keras.Sequential([salaire_normalizer, layers.Dense(units=1)])
salaire_model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
normalization_1 (Normalizat (None, 1) 3
ion)
dense (Dense) (None, 1) 2
=================================================================
Total params: 5
Trainable params: 2
Non-trainable params: 3
_________________________________________________________________
Predictions
salaire[:5]
salaire_model.predict(salaire[:5])
array([47722.535, 0. , 21578.21 , 0. , 0. ])
array([[ 1.884],
[-0.513],
[ 0.571],
[-0.513],
[-0.513]], dtype=float32)
# Configuring the training procedure [THIS IS WHERE THE CALIBRATION METHOD CHOICES ARE MADE]
salaire_model.compile(
    optimizer=tf.optimizers.Adam(learning_rate=0.1),  # Optimization method
    loss="mean_absolute_error",  # What we are trying to minimize
)
Training
# Train the model for 100 epochs
history = salaire_model.fit(
train_features["salaire_de_base"],
train_labels,
epochs=100,
# Suppress logging.
verbose=0,
# Calculate validation results on 20% of the training data.
validation_split=0.2,
)
CPU times: user 10min 17s, sys: 1min 8s, total: 11min 25s
Wall time: 6min 23s
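Note that the survey design weights (poids_de_sondage) play no role in this fit. If one wanted each observation to count proportionally to wprm, Keras Model.fit accepts a sample_weight argument; a hedged sketch, not executed in this notebook:
weights_train = poids_de_sondage.loc[train_features.index].values
history_weighted = salaire_model.fit(
    train_features["salaire_de_base"],
    train_labels,
    sample_weight=weights_train,
    epochs=100,
    verbose=0,
)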
# Visualize the model's training progress using the stats stored in the history object:
hist = pd.DataFrame(history.history)
hist["epoch"] = history.epoch
hist.tail()
|   | loss | val_loss | epoch |
|---|---|---|---|
| 95 | 31708.880859 | 34078.207031 | 95 |
| 96 | 31708.953125 | 34078.210938 | 96 |
| 97 | 31708.863281 | 34078.218750 | 97 |
| 98 | 31708.919922 | 34078.222656 | 98 |
| 99 | 31709.003906 | 34078.222656 | 99 |
# Function to plot the evolution of the loss
def plot_loss(history):
    plt.plot(history.history["loss"], label="loss")
    plt.plot(history.history["val_loss"], label="val_loss")
    minimum = min(min(history.history["loss"]), min(history.history["val_loss"]))
    maximum = max(max(history.history["loss"]), max(history.history["val_loss"]))
    plt.ylim(minimum, maximum)
    plt.xlabel("Epoch")
    plt.ylabel("Error [RFR]")
    plt.legend()
    plt.grid(True)
min(history.history["loss"])
plot_loss(history)
31708.8203125
[figure: training and validation loss vs epoch]
# Collect the results on the test set for later
test_results = {}
test_results["salaire_model"] = salaire_model.evaluate(
test_features["salaire_de_base"], test_labels, verbose=0
)
# Since this is a single-variable regression, we can even plot the prediction:
x = tf.linspace(0.0, 250, 251)
y = salaire_model.predict(x)
def plot_salaire(x, y):
    plt.scatter(train_features["salaire_de_base"], train_labels, label="Data")
    plt.plot(x, y, color="k", label="Predictions")
    plt.xlabel("salaire_de_base")
    plt.ylabel("RFR")
    plt.legend()
plot_salaire(x, y)
Linear regression with multiple inputs
You can use an almost identical setup to make predictions based on multiple inputs. This model still does the same y = m.x + b except that m is a matrix and b is a vector.
Create a two-step Keras Sequential model again with the first layer being normalizer (tf.keras.layers.Normalization(axis=-1)) you defined earlier and adapted to the whole dataset:
linear_model = tf.keras.Sequential([normalizer, layers.Dense(units=1)])
We call Model.predict on a batch of inputs, which produces units=1 outputs for each example:
linear_model.predict(train_features[:10])
array([[ 0.272],
[-0.274],
[-0.166],
[ 0.248],
[-0.255],
[ 0.165],
[ 0.151],
[ 0.285],
[-0.179],
[ 0.274]], dtype=float32)
# The weight matrix (shape 6 × 1: one weight per feature column)
linear_model.layers[1].kernel
<tf.Variable 'dense_1/kernel:0' shape=(6, 1) dtype=float32, numpy=
array([[0.347],
[0.058],
[0.006],
[0.623],
[0.211],
[0.875]], dtype=float32)>
Configure the model with Keras Model.compile and train it with Model.fit for 100 epochs:
linear_model.compile(
optimizer=tf.optimizers.Adam(learning_rate=0.1), loss="mean_absolute_error"
)
history = linear_model.fit(
train_features,
train_labels,
epochs=100,
# Suppress logging.
verbose=0,
# Calculate validation results on 20% of the training data.
validation_split=0.2,
)
CPU times: user 10min 32s, sys: 1min 8s, total: 11min 41s
Wall time: 6min 25s
plot_loss(history)
Collecting the results on the test set for later:
test_results["linear_model"] = linear_model.evaluate(
test_features, test_labels, verbose=0
)
Using a neural network
Source: https://www.tensorflow.org/tutorials/keras/regression
In the previous section, you implemented two linear models for single and multiple inputs.
Here, you will implement single-input and multiple-input DNN models.
The code is basically the same except the model is expanded to include some “hidden” non-linear layers. The name “hidden” here just means not directly connected to the inputs or outputs.
These models will contain a few more layers than the linear model:
- The normalization layer, as before (salaire_normalizer for the single-input model, normalizer for the multiple-input model).
- Two hidden, non-linear Dense layers with the ReLU (relu) activation function.
- A linear single-output Dense layer.
Both models will use the same training procedure, so the compile step is included in the build_and_compile_model function below.
def build_and_compile_model(norm):
    model = keras.Sequential(
        [
            norm,
            layers.Dense(64, activation="relu"),
            layers.Dense(64, activation="relu"),
            layers.Dense(1),
        ]
    )
    model.compile(loss="mean_absolute_error", optimizer=tf.keras.optimizers.Adam(0.001))
    return model
Regression using a DNN and a single input
Create a DNN model with only 'salaire_de_base' as input and salaire_normalizer (defined earlier) as the normalization layer:
dnn_salaire_model = build_and_compile_model(salaire_normalizer)
# This model has quite a few more trainable parameters than the linear models:
dnn_salaire_model.summary()
Model: "sequential_2"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
normalization_1 (Normalizat (None, 1) 3
ion)
dense_2 (Dense) (None, 64) 128
dense_3 (Dense) (None, 64) 4160
dense_4 (Dense) (None, 1) 65
=================================================================
Total params: 4,356
Trainable params: 4,353
Non-trainable params: 3
_________________________________________________________________
# Training the model with Keras Model.fit:
history = dnn_salaire_model.fit(
train_features["salaire_de_base"],
train_labels,
validation_split=0.2,
verbose=0,
epochs=100,
)
CPU times: user 12min 29s, sys: 1min 37s, total: 14min 6s
Wall time: 7min 19s
plot_loss(history)
# Plot the model's prediction as a function of the input variable
x = tf.linspace(0.0, 250, 251)
y = dnn_salaire_model.predict(x)
plot_salaire(x, y)
# Collecting the results on the test set for later:
test_results["dnn_salaire_model"] = dnn_salaire_model.evaluate(
test_features["salaire_de_base"], test_labels, verbose=0
)
Regression using a DNN and multiple inputs
Repeat the previous process using all the inputs. The model’s performance slightly improves on the validation dataset.
# Repeating the previous process using all the inputs.
# The model's performance slightly improves on the validation dataset.
dnn_model = build_and_compile_model(normalizer)
dnn_model.summary()
Model: "sequential_3"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
normalization (Normalizatio (None, 6) 13
n)
dense_5 (Dense) (None, 64) 448
dense_6 (Dense) (None, 64) 4160
dense_7 (Dense) (None, 1) 65
=================================================================
Total params: 4,686
Trainable params: 4,673
Non-trainable params: 13
_________________________________________________________________
history = dnn_model.fit(
train_features, train_labels, validation_split=0.2, verbose=0, epochs=100
)
CPU times: user 12min 38s, sys: 1min 34s, total: 14min 13s
Wall time: 7min 20s
plot_loss(history)
test_results["dnn_model"] = dnn_model.evaluate(test_features, test_labels, verbose=0)## Performance Now that all models are trained, let’s review their test set performance
pd.DataFrame(test_results, index=["Mean absolute error [RFR]"]).T
|   | Mean absolute error [RFR] |
|---|---|
| salaire_model | 34170.281250 |
| linear_model | 31090.541016 |
| dnn_salaire_model | 33908.257812 |
| dnn_model | 26228.125000 |
Making predictions
test_predictions = dnn_model.predict(test_features).flatten()
a = plt.axes(aspect="equal")
plt.scatter(test_labels, test_predictions)
plt.xlabel("True Values [RFR]")
plt.ylabel("Predictions [RFR]")
maximum = max(max(test_labels), max(test_predictions))
lims = [0, maximum]  # use the data range for the axes (original had a fixed [0, 50], which would hide nearly all points)
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)
[figure: scatter of predictions vs true values of RFR on the test set]
# Checking the error distribution
error = test_predictions - test_labels
plt.hist(error, bins=25)
plt.xlabel("Prediction Error [RFR]")
_ = plt.ylabel("Count")(array([ 2., 0., 0., 0., 1., 1., 0., 0.,
1., 0., 1., 0., 0., 0., 0., 0.,
0., 0., 10., 20., 16., 25., 4., 67468.,
13.]),
array([-96741090.784, -92674230.853, -88607370.921, -84540510.99 ,
-80473651.059, -76406791.127, -72339931.196, -68273071.264,
-64206211.333, -60139351.402, -56072491.47 , -52005631.539,
-47938771.608, -43871911.676, -39805051.745, -35738191.814,
-31671331.882, -27604471.951, -23537612.02 , -19470752.088,
-15403892.157, -11337032.225, -7270172.294, -3203312.363,
863547.569, 4930407.5 ]),
<BarContainer object of 25 artists>)
Text(0.5, 0, 'Prediction Error [MPG]')

# Saving the model for later
dnn_model.save("dnn_model")2022-01-17 22:09:49.930273: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
INFO:tensorflow:Assets written to: dnn_model/assets
# We can reload the model and have the exact same results:
reloaded = tf.keras.models.load_model("dnn_model")
test_results["reloaded"] = reloaded.evaluate(test_features, test_labels, verbose=0)Conclusion
We can improve this method by:
- Choosing the loss function: mean squared error (MSE) (tf.losses.MeanSquaredError) and mean absolute error (MAE) (tf.losses.MeanAbsoluteError) are commonly used for regression problems; MAE is less sensitive to outliers. Different loss functions are used for classification problems.
- Addressing overfitting, a common problem for DNN models, although it did not appear to be an issue in this run. See the Overfit and underfit tutorial below for more on this, and the sketch at the end of this section.
Overfit and underfit
Source: https://www.tensorflow.org/tutorials/keras/overfit_and_underfit
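A hedged sketch combining both suggestions from the conclusion: switching the loss to MSE and guarding against overfitting with early stopping (the patience value and epoch budget are arbitrary; this cell was not run in this notebook):
model = build_and_compile_model(normalizer)
model.compile(
    loss=tf.keras.losses.MeanSquaredError(),
    optimizer=tf.keras.optimizers.Adam(0.001),
)
stop_early = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=10, restore_best_weights=True
)
history = model.fit(
    train_features,
    train_labels,
    validation_split=0.2,
    epochs=1000,
    verbose=0,
    callbacks=[stop_early],
)
plot_loss(history)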