--- title: Essais et développement de la méthode de calage sur marges keywords: fastai sidebar: home_sidebar nb_path: "notebooks/calmar/essais_neural_network.ipynb" ---
from leximpact_socio_fisca_simu_etat.config import Configuration
config = Configuration(project_folder="leximpact-prepare-data")
import unittest
tc = unittest.TestCase()
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Make NumPy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# Toy stand-in for the ERFS survey: "d" plays the role of the survey weights
# (wprm), "X" the role of the RFR and "Y" is the variable of interest.
sample = pd.DataFrame(
    [
        # Bucket 0
        (1, 1, 0, 12),
        (2, 1, 0, 32),
        (3, 1, 0, 5),
        # Bucket 0-100
        (4, 1, 46, 0),
        (5, 1, 99, 4323),
        (6, 1, 90, 104),
        # Bucket 250-1000
        (7, 1, 250, 102),
        (8, 1, 300, 1253),
        (9, 1, 1000, 92),
        # Bucket 1000-25000
        (10, 1, 21_000, 9217),
        (11, 1, 12_000, 91),
        (12, 1, 1000, 0),
        (13, 1, 8000, 0),
        (14, 1, 1830, 9812),
        (15, 1, 1185, 100281),
        (16, 1, 1981, 9822),
        (17, 1, 18417, 91),
        # Bucket 25000-50000
        (18, 1, 26_000, 2301),
        (19, 1, 49_000, 87203),
    ],
    columns=["idfoy", "d", "X", "Y"],
)
sample.head()
# Weighted total of Y: each row's value multiplied by its weight, then summed.
estimateur_y = (sample["Y"] * sample["d"]).sum()
print(estimateur_y)
# Read the HDF5 table whose path is assembled from the configured DATA_OUT
# prefix and YEAR_ERFS value.
erfs_03 = pd.read_hdf(
    "".join(
        [
            config.get("DATA_OUT"),
            "03_erfs_rfr_cal_ind",
            config.get("YEAR_ERFS"),
            ".h5",
        ]
    )
)
erfs_03.columns
erfs_03.tail()
# Keep only the identifiers, the survey weight and the income variables that
# the rest of the notebook works with.
dataset = erfs_03[
    [
        "idfoy",
        "quifoy",
        "wprm",
        "salaire_de_base",
        "chomage_brut",
        "retraite_brute",
        "f4ba",
        "rfr",
    ]
]
# Work on a reproducible 1000-row subset while developing. The subset is drawn
# from `dataset` (not from `erfs_03`) so that the column selection above is
# kept rather than silently replaced by the full table.
dataset = dataset.sample(n=1000, random_state=53)
print("On a ", len(dataset), "individus dans cet échantillon")
dataset.head()
# Here `wprm` holds the initial ("survey") weights.
J = 4  # presumably the number of margin variables -- matches len(X_avant)
# Weighted total of each margin variable before any calibration.
X_avant = [
    (dataset["wprm"] * dataset[colonne]).sum()
    for colonne in ("salaire_de_base", "chomage_brut", "retraite_brute", "f4ba")
]
X_avant
# Target totals for the margin variables, in the same order as X_avant:
# salaire_de_base, chomage_brut, retraite_brute, f4ba.
X = [
    650855163531.0,  # Taxable salary, POTE 2019
    # The true total is unknown, so an arbitrary gap is applied to have
    # something to calibrate towards.
    0.97 * (dataset["wprm"] * dataset["chomage_brut"]).sum(),
    # Single pension target. It was previously split into four separate list
    # items, which produced seven targets for four margins.
    # NOTE(review): presumably a POTE 2019 figure like the salary -- confirm.
    307_254_581_479.0,
    # The true total is unknown, so an arbitrary gap is applied to have
    # something to calibrate towards.
    1.34 * (dataset["wprm"] * dataset["f4ba"]).sum(),
]
X
# Weighted RFR total obtained with the initial survey weights. It is built
# from the "rfr" column so that it is comparable with the RFR target below.
Y_initial = (dataset["wprm"] * dataset["rfr"]).sum()
# Reference value it is compared with.
Y_wanted = 1084463009284.0  # RFR POTE 2019
# Relative error before calibration, in percent.
error_initiale = 100 * abs((Y_initial - Y_wanted) / Y_wanted)
print("Erreur avant calibration: ", error_initiale, " %")
Source: https://www.tensorflow.org/tutorials/keras/regression
# Set the survey weights aside: `pop` removes the "wprm" column from `dataset`
# and returns it.
poids_de_sondage = dataset.pop("wprm")
dataset
poids_de_sondage.head()
# Raise an AssertionError if any missing value remains in the data.
tc.assertEqual(dataset.isna().sum().sum(), 0)
# If missing values were present, they could be removed with:
# dataset = dataset.fillna(0) or dataset = dataset.dropna(0)
# 80% of the rows go to training (seed fixed for development); the remaining
# rows, selected by index, form the test set.
train_dataset = dataset.sample(frac=0.8, random_state=0)  # Seed fixed for development
test_dataset = dataset.drop(train_dataset.index)
train_dataset.head()
test_dataset.head()
# Pairwise plots of three income variables and the RFR on the training rows,
# with kernel density estimates on the diagonal.
sns.pairplot(
train_dataset[["salaire_de_base", "chomage_brut", "retraite_brute", "rfr"]],
diag_kind="kde",
)
# Summary statistics, one row per column.
train_dataset.describe().transpose()
# Work on copies so that popping the label leaves train/test_dataset intact.
train_features = train_dataset.copy()
test_features = test_dataset.copy()
# Set aside the variable we want to predict.
train_labels = train_features.pop("rfr")
test_labels = test_features.pop("rfr")
NOTES
# Mean and standard deviation of every training column, for reference.
train_dataset.describe().transpose()[["mean", "std"]]

# Normalization layer applied along the last (feature) axis.
normalizer = tf.keras.layers.Normalization(axis=-1)
# Fit its statistics (mean and variance) on the training features.
normalizer.adapt(np.array(train_features))
print(normalizer.mean.numpy())

# Show the first training row before and after normalization.
first = np.array(train_features[:1])
with np.printoptions(precision=2, suppress=True):
    print("First example:", first)
    print("Normalized:", normalizer(first).numpy())
# Single-input linear model: a scalar normalization of "salaire_de_base"
# followed by one Dense unit.
salaire = np.array(train_features["salaire_de_base"])

salaire_normalizer = layers.Normalization(input_shape=[1], axis=None)
salaire_normalizer.adapt(salaire)

salaire_model = tf.keras.Sequential(
    [
        salaire_normalizer,
        layers.Dense(units=1),
    ]
)
salaire_model.summary()

# Run the model (not yet trained) on the first five salaries.
salaire[:5]
salaire_model.predict(salaire[:5])

salaire_model.compile(
    loss="mean_absolute_error",  # Quantity being minimised
    optimizer=tf.optimizers.Adam(learning_rate=0.1),  # Optimisation method
)
%%time
# Train the single-input model for 100 epochs.
history = salaire_model.fit(
train_features["salaire_de_base"],
train_labels,
epochs=100,
# Suppress logging.
verbose=0,
# Calculate validation results on 20% of the training data.
validation_split=0.2,
)
# Collect the per-epoch metrics in a DataFrame and show the last epochs.
hist = pd.DataFrame(history.history)
hist["epoch"] = history.epoch
hist.tail()
def plot_loss(history):
    """Plot the training and validation loss curves of a fitted model.

    The y-axis is limited to the range spanned by the two curves.

    Args:
        history: object whose ``history`` attribute is a mapping holding the
            ``"loss"`` and ``"val_loss"`` sequences, one value per epoch.
    """
    train_loss = history.history["loss"]
    val_loss = history.history["val_loss"]

    plt.plot(train_loss, label="loss")
    plt.plot(val_loss, label="val_loss")

    # Tight vertical range: smallest and largest value over both curves.
    lowest = min(min(train_loss), min(val_loss))
    highest = max(max(train_loss), max(val_loss))
    plt.ylim(lowest, highest)

    plt.xlabel("Epoch")
    plt.ylabel("Error [RFR]")
    plt.legend()
    plt.grid(True)
# Lowest training loss reached during the fit.
min(history.history["loss"])
plot_loss(history)
# Results of each model on the test set, keyed by model name.
test_results = {}
test_results["salaire_model"] = salaire_model.evaluate(
test_features["salaire_de_base"], test_labels, verbose=0
)
# Evaluation grid of 251 points from 0 to the largest salary in the training
# data, so that the prediction line covers the plotted points instead of a
# fixed 0-250 range unrelated to the data.
x = tf.linspace(0.0, float(train_features["salaire_de_base"].max()), 251)
y = salaire_model.predict(x)
def plot_salaire(x, y):
    """Overlay a prediction curve on the training data.

    The module-level ``train_features["salaire_de_base"]`` and
    ``train_labels`` are drawn as a scatter plot, and ``(x, y)`` as a black
    line on top of it.

    Args:
        x: abscissae of the prediction curve.
        y: predicted values at ``x``.
    """
    observed_salaire = train_features["salaire_de_base"]
    plt.scatter(observed_salaire, train_labels, label="Data")
    plt.plot(x, y, color="k", label="Predictions")

    plt.xlabel("salaire_de_base")
    plt.ylabel("RFR")
    plt.legend()
# Draw the single-input model's predictions over the training data.
plot_salaire(x, y)
You can use an almost identical setup to make predictions based on multiple inputs. This model still does the same $y = mx + b$, except that $m$ is a matrix and $x$ is a vector.
Create a two-step Keras Sequential model again with the first layer being normalizer (tf.keras.layers.Normalization(axis=-1)) you defined earlier and adapted to the whole dataset:
# Multi-input linear model: the `normalizer` layer followed by a single Dense unit.
linear_model = tf.keras.Sequential([normalizer, layers.Dense(units=1)])
We call Model.predict on a batch of inputs, which produces units=1 outputs for each example:
# Run the model (not yet trained) on the first 10 training rows.
linear_model.predict(train_features[:10])
# Weight matrix of the Dense layer (second layer of the model).
linear_model.layers[1].kernel
Configure the model with Keras Model.compile and train it with Model.fit for 100 epochs:
# Adam optimiser with learning rate 0.1, minimising the mean absolute error.
linear_model.compile(
optimizer=tf.optimizers.Adam(learning_rate=0.1), loss="mean_absolute_error"
)
%%time
# Train the multi-input linear model for 100 epochs on all training features.
history = linear_model.fit(
train_features,
train_labels,
epochs=100,
# Suppress logging.
verbose=0,
# Calculate validation results on 20% of the training data.
validation_split=0.2,
)
plot_loss(history)
Collecting the results on the test set for later:
# Store the linear model's result on the test set under its own key.
test_results["linear_model"] = linear_model.evaluate(
test_features, test_labels, verbose=0
)
Source: https://www.tensorflow.org/tutorials/keras/regression
In the previous section, you implemented two linear models for single and multiple inputs.
Here, you will implement single-input and multiple-input DNN models.
The code is basically the same except the model is expanded to include some "hidden" non-linear layers. The name "hidden" here just means not directly connected to the inputs or outputs.
These models will contain a few more layers than the linear model:
The normalization layer, as before (with salaire_normalizer for a single-input model and normalizer for a multiple-input model). Two hidden, non-linear, Dense layers with the ReLU (relu) activation function nonlinearity. A linear Dense single-output layer. Both models will use the same training procedure, so the compile method is included in the build_and_compile_model function below.
def build_and_compile_model(norm):
    """Build and compile a small fully connected regression network.

    The network stacks ``norm``, two 64-unit Dense layers with ReLU
    activation and a single-unit Dense output layer. It is compiled with the
    mean absolute error loss and an Adam optimiser (learning rate 0.001).

    Args:
        norm: layer placed first in the model (a normalization layer in this
            notebook).

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    hidden_layers = [layers.Dense(64, activation="relu") for _ in range(2)]
    model = keras.Sequential([norm, *hidden_layers, layers.Dense(1)])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss="mean_absolute_error",
    )
    return model
Regression using a DNN and a single input. Create a DNN model with only 'salaire_de_base' as input and salaire_normalizer (defined earlier) as the normalization layer:
# Single-input DNN: the scalar salary normalizer followed by the dense layers
# added by build_and_compile_model.
dnn_salaire_model = build_and_compile_model(salaire_normalizer)
dnn_salaire_model.summary()
%%time
# Train the single-input DNN with Keras Model.fit for 100 epochs, holding out
# 20% of the training data for validation.
history = dnn_salaire_model.fit(
train_features["salaire_de_base"],
train_labels,
validation_split=0.2,
verbose=0,
epochs=100,
)
plot_loss(history)
# Evaluation grid of 251 points from 0 to the largest salary in the training
# data, so that the prediction curve covers the plotted points instead of a
# fixed 0-250 range unrelated to the data.
x = tf.linspace(0.0, float(train_features["salaire_de_base"].max()), 251)
y = dnn_salaire_model.predict(x)
plot_salaire(x, y)
# Store the single-input DNN's result on the test set under its own key.
test_results["dnn_salaire_model"] = dnn_salaire_model.evaluate(
test_features["salaire_de_base"], test_labels, verbose=0
)
# The model's performance slightly improves on the validation dataset.
# Multi-input DNN built on the `normalizer` layer.
dnn_model = build_and_compile_model(normalizer)
dnn_model.summary()
%%time
# Train the multi-input DNN for 100 epochs, holding out 20% of the training
# data for validation.
history = dnn_model.fit(
train_features, train_labels, validation_split=0.2, verbose=0, epochs=100
)
plot_loss(history)
test_results["dnn_model"] = dnn_model.evaluate(test_features, test_labels, verbose=0)
# Side-by-side table of the test results, one row per model.
pd.DataFrame(test_results, index=["Mean absolute error [RFR]"]).T
# Predicted versus true RFR on the test set, on square axes.
test_predictions = dnn_model.predict(test_features).flatten()
a = plt.axes(aspect="equal")
plt.scatter(test_labels, test_predictions)
plt.xlabel("True Values [RFR]")
plt.ylabel("Predictions [RFR]")
# Size both axes from the data: a fixed [0, 50] window would hide nearly
# every point at the scale of the RFR.
maximum = max(max(test_labels), max(test_predictions))
lims = [0, maximum]
plt.xlim(lims)
plt.ylim(lims)
# Diagonal on which a perfect prediction would fall.
_ = plt.plot(lims, lims)
# Distribution of the prediction errors (prediction minus true value), in 25 bins.
error = test_predictions - test_labels
plt.hist(error, bins=25)
plt.xlabel("Prediction Error [RFR]")
_ = plt.ylabel("Count")
# Save the trained model under "dnn_model", load it back, and evaluate the
# reloaded copy on the test set.
dnn_model.save("dnn_model")
reloaded = tf.keras.models.load_model("dnn_model")
test_results["reloaded"] = reloaded.evaluate(test_features, test_labels, verbose=0)
We can improve this method by:
Choosing the loss function. Mean squared error (MSE) (tf.losses.MeanSquaredError) and mean absolute error (MAE) (tf.losses.MeanAbsoluteError) are commonly used for regression problems. MAE is less sensitive to outliers. Different loss functions are used for classification problems.
Overfitting is a common problem for DNN models, though it wasn't a problem for this tutorial. Visit the Overfit and underfit tutorial for more help with this.
Source: https://www.tensorflow.org/tutorials/keras/overfit_and_underfit