Deep Learning with PyTorch Step-by-Step: A Beginner's Guide¶

Chapter 0¶

In [3]:
try:
    import google.colab
    import requests
    url = 'https://raw.githubusercontent.com/dvgodoy/PyTorchStepByStep/master/config.py'
    r = requests.get(url, allow_redirects=True)
    open('config.py', 'wb').write(r.content)    
except ModuleNotFoundError:
    pass

from config import *
config_chapter0()
# This is needed to render the plots in this chapter
from plots.chapter0 import *
In [4]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

Visualizing Gradient Descent¶

Model¶

$$ \Large y = b + w x + \epsilon $$

Data Generation¶

Synthetic Data Generation¶

In [5]:
true_b = 1
true_w = 2
N = 100

# Data Generation
np.random.seed(42)
x = np.random.rand(N, 1)
epsilon = (.1 * np.random.randn(N, 1))
y = true_b + true_w * x + epsilon

Train-Validation-Test Split¶

In [10]:
# Shuffles the indices
idx = np.arange(N)
np.random.shuffle(idx)

# Uses first 80 random indices for train
train_idx = idx[:int(N*.8)]
# Uses the remaining indices for validation
val_idx = idx[int(N*.8):]

# Generates train and validation sets
x_train, y_train = x[train_idx], y[train_idx]
x_val, y_val = x[val_idx], y[val_idx]
In [11]:
figure1(x_train, y_train, x_val, y_val)
Out[11]:
[Figure: 1200x600, 2 subplots - "Generated Data - Train" and "Generated Data - Validation" (x vs. y)]

Step 0: Random Initialization¶

In [12]:
# Step 0 - Initializes parameters "b" and "w" randomly
np.random.seed(42)
b = np.random.randn(1)
w = np.random.randn(1)

print(b, w)
[0.49671415] [-0.1382643]

Step 1: Compute Model's Predictions¶

In [14]:
# Step 1 - Computes our model's predicted output - forward pass
yhat = b + w * x_train
In [15]:
figure2(x_train, y_train, b, w)
Out[15]:
[Figure: 600x600, 1 subplot (x vs. y)]

Step 2: Compute the Loss¶

$$ \Large \text{error}_i = \hat{y_i} - y_i $$
In [9]:
figure3(x_train, y_train, b, w)
Out[9]:
[Figure: 432x432, 1 subplot]
$$ \Large \begin{aligned} \text{MSE} &= \frac{1}{n} \sum_{i=1}^n{\text{error}_i}^2 \\ &= \frac{1}{n} \sum_{i=1}^n{(\hat{y_i} - y_i)}^2 \\ &= \frac{1}{n} \sum_{i=1}^n{(b + w x_i - y_i)}^2 \end{aligned} $$
In [16]:
# Step 2 - Computing the loss
# We are using ALL data points, so this is BATCH gradient
# descent. How wrong is our model? That's the error!
error = (yhat - y_train)

# It is a regression, so it computes mean squared error (MSE)
loss = (error ** 2).mean()
print(loss)
2.7421577700550976
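As a quick sanity check (not part of the original code), the same number can be reproduced with Scikit-Learn's mean_squared_error:

In [ ]:
# Sanity check (not in the original notebook): Scikit-Learn's MSE
# should match the manual computation above
from sklearn.metrics import mean_squared_error

print(mean_squared_error(y_train, yhat))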

Loss Surface¶

In [17]:
# Reminder:
# true_b = 1
# true_w = 2

# We split each range into 100 evenly spaced intervals (101 points each)
b_range = np.linspace(true_b - 3, true_b + 3, 101)
w_range = np.linspace(true_w - 3, true_w + 3, 101)
# meshgrid is a handy function that generates a grid of b and w
# values for all combinations
bs, ws = np.meshgrid(b_range, w_range)
bs.shape, ws.shape
Out[17]:
((101, 101), (101, 101))
In [12]:
bs
Out[12]:
array([[-2.  , -1.94, -1.88, ...,  3.88,  3.94,  4.  ],
       [-2.  , -1.94, -1.88, ...,  3.88,  3.94,  4.  ],
       [-2.  , -1.94, -1.88, ...,  3.88,  3.94,  4.  ],
       ...,
       [-2.  , -1.94, -1.88, ...,  3.88,  3.94,  4.  ],
       [-2.  , -1.94, -1.88, ...,  3.88,  3.94,  4.  ],
       [-2.  , -1.94, -1.88, ...,  3.88,  3.94,  4.  ]])
In [18]:
sample_x = x_train[0]
sample_yhat = bs + ws * sample_x
sample_yhat.shape
Out[18]:
(101, 101)
In [19]:
all_predictions = np.apply_along_axis(
    func1d=lambda x: bs + ws * x, 
    axis=1, 
    arr=x_train
)
all_predictions.shape
Out[19]:
(80, 101, 101)
In [20]:
all_labels = y_train.reshape(-1, 1, 1)
all_labels.shape
Out[20]:
(80, 1, 1)
In [21]:
all_errors = (all_predictions - all_labels)
all_errors.shape
Out[21]:
(80, 101, 101)
In [22]:
all_losses = (all_errors ** 2).mean(axis=0)
all_losses.shape
Out[22]:
(101, 101)
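The same grid of losses can also be computed with plain NumPy broadcasting instead of apply_along_axis. Below is a minimal equivalent sketch we added (the names all_predictions_bc and all_losses_bc are ours, not the book's):

In [ ]:
# Equivalent computation using broadcasting (not in the original notebook):
# x_train is (80, 1); reshaping it to (80, 1, 1) broadcasts against the
# (101, 101) grids bs and ws, producing one prediction grid per data point
all_predictions_bc = bs + ws * x_train.reshape(-1, 1, 1)               # (80, 101, 101)
all_losses_bc = ((all_predictions_bc - all_labels) ** 2).mean(axis=0)  # (101, 101)

# Should match the apply_along_axis result
print(np.allclose(all_losses_bc, all_losses))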
In [18]:
figure4(x_train, y_train, b, w, bs, ws, all_losses)
Out[18]:
[Figure: 864x432, 2 subplots (one 3D)]

Cross Sections¶

In [19]:
figure5(x_train, y_train, b, w, bs, ws, all_losses)
Out[19]:
[Figure: 864x432, 2 subplots]
In [20]:
figure6(x_train, y_train, b, w, bs, ws, all_losses)
Out[20]:
[Figure: 864x432, 2 subplots]

Step 3: Compute the Gradients¶

$$ \Large \begin{aligned} \frac{\partial{\text{MSE}}}{\partial{b}} = \frac{\partial{\text{MSE}}}{\partial{\hat{y_i}}} \frac{\partial{\hat{y_i}}}{\partial{b}} &= \frac{1}{n} \sum_{i=1}^n{2(b + w x_i - y_i)} \\ &= 2 \frac{1}{n} \sum_{i=1}^n{(\hat{y_i} - y_i)} \\ \frac{\partial{\text{MSE}}}{\partial{w}} = \frac{\partial{\text{MSE}}}{\partial{\hat{y_i}}} \frac{\partial{\hat{y_i}}}{\partial{w}} &= \frac{1}{n} \sum_{i=1}^n{2(b + w x_i - y_i) x_i} \\ &= 2 \frac{1}{n} \sum_{i=1}^n{x_i (\hat{y_i} - y_i)} \end{aligned} $$
In [23]:
# Step 3 - Computes gradients for both "b" and "w" parameters
b_grad = 2 * error.mean()
w_grad = 2 * (x_train * error).mean()
print(b_grad, w_grad)
-3.044811379650508 -1.8337537171510832
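To double-check the analytical gradients above, we can compare them against central finite differences of the loss. This is a sketch we added, not the book's code; the helper mse_at is ours:

In [ ]:
# Numerical check of the gradients (not in the original notebook)
def mse_at(b_, w_):
    # MSE of the linear model with parameters b_ and w_ on the training set
    return ((b_ + w_ * x_train - y_train) ** 2).mean()

eps = 1e-6
numerical_b_grad = (mse_at(b + eps, w) - mse_at(b - eps, w)) / (2 * eps)
numerical_w_grad = (mse_at(b, w + eps) - mse_at(b, w - eps)) / (2 * eps)

# Both should be very close to b_grad and w_grad computed above
print(numerical_b_grad, numerical_w_grad)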

Visualizing the Gradients¶

In [22]:
figure7(b, w, bs, ws, all_losses)
Out[22]:
[Figure: 864x432, 2 subplots]
In [23]:
figure8(b, w, bs, ws, all_losses)
Out[23]:
[Figure: 864x432, 2 subplots]

Backpropagation¶

Step 4: Update the Parameters¶

$$ \Large \begin{aligned} b &= b - \eta \frac{\partial{\text{MSE}}}{\partial{b}} \\ w &= w - \eta \frac{\partial{\text{MSE}}}{\partial{w}} \end{aligned} $$
In [24]:
# Sets the learning rate - this is "eta", the Greek letter that looks like an "n"
lr = 0.1
print(b, w)

# Step 4 - Updates parameters using gradients and the 
# learning rate
b = b - lr * b_grad
w = w - lr * w_grad

print(b, w)
[0.49671415] [-0.1382643]
[0.80119529] [0.04511107]
In [25]:
figure9(x_train, y_train, b, w)
Out[25]:
[Figure: 432x432, 1 subplot]
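Since the loss surface is convex and the learning rate is small, this single update should already bring the loss down from the 2.7421... computed in Step 2. A quick check we added (not in the original code):

In [ ]:
# Recompute the loss with the UPDATED parameters (not in the original notebook)
# The result should be lower than the loss obtained before the update
updated_yhat = b + w * x_train
updated_loss = ((updated_yhat - y_train) ** 2).mean()
print(updated_loss)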

Learning Rate¶

In [25]:
manual_grad_b = -2.90
manual_grad_w = -1.79

np.random.seed(42)
b_initial = np.random.randn(1)
w_initial = np.random.randn(1)

Low Learning Rate¶

In [26]:
# Learning rate - the Greek letter "eta" that looks like an "n"
lr = .2

figure10(b_initial, w_initial, bs, ws, all_losses, manual_grad_b, manual_grad_w, lr)
Out[26]:
[Figure: 1200x600, 2 subplots - "Fixed: b = 0.52" (x: w, y: MSE (loss)) and "Fixed: w = -0.16" (x: b)]

High Learning Rate¶

In [28]:
# Learning rate - the Greek letter "eta" that looks like an "n"
lr = .8

figure10(b_initial, w_initial, bs, ws, all_losses, manual_grad_b, manual_grad_w, lr)
Out[28]:
[Figure: 864x432, 2 subplots]

Very High Learning Rate¶

In [29]:
# Learning rate - the Greek letter "eta" that looks like an "n"
lr = 1.1

figure10(b_initial, w_initial, bs, ws, all_losses, manual_grad_b, manual_grad_w, lr)
Out[29]:
[Figure: 864x432, 2 subplots]
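To see why a learning rate past a certain threshold makes the updates overshoot ever farther instead of converging, here is a tiny toy example we added (not from the book): gradient descent on the one-dimensional quadratic f(w) = w², whose gradient is 2w, so each update multiplies w by (1 - 2 * lr) and diverges whenever lr > 1.

In [ ]:
# Toy illustration (not in the original notebook): gradient descent on f(w) = w**2
# Each update is w <- w - lr * 2w = w * (1 - 2 * lr), so it diverges when lr > 1
def toy_descent(lr, w0=1.0, steps=5):
    w_ = w0
    path = [w_]
    for _ in range(steps):
        w_ = w_ - lr * 2 * w_   # the gradient of w**2 is 2w
        path.append(w_)
    return path

# Same three learning rates as in the figures above
for lr_ in [.2, .8, 1.1]:
    print(lr_, toy_descent(lr_))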

"Bad" Feature¶

In [30]:
true_b = 1
true_w = 2
N = 100

# Data Generation
np.random.seed(42)

# We divide w by 10
bad_w = true_w / 10
# And multiply x by 10
bad_x = np.random.rand(N, 1) * 10

# So, the net effect on y is zero - it is still
# the same as before
y = true_b + bad_w * bad_x + (.1 * np.random.randn(N, 1))
In [31]:
# Generates train and validation sets
# It uses the same train_idx and val_idx as before,
# but it applies to bad_x
bad_x_train, y_train = bad_x[train_idx], y[train_idx]
bad_x_val, y_val = bad_x[val_idx], y[val_idx]
In [32]:
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
ax[0].scatter(x_train, y_train)
ax[0].set_xlabel('x')
ax[0].set_ylabel('y')
ax[0].set_ylim([0, 3.1])
ax[0].set_title('Train - Original')
ax[1].scatter(bad_x_train, y_train, c='k')
ax[1].set_xlabel('x')
ax[1].set_ylabel('y')
ax[1].set_ylim([0, 3.1])
ax[1].set_title('Train - "Bad"')
fig.tight_layout()
In [33]:
# The ranges CHANGED because we are centering at the new minimum, using "bad" data
bad_b_range = np.linspace(-2, 4, 101)
bad_w_range = np.linspace(-2.8, 3.2, 101)
bad_bs, bad_ws = np.meshgrid(bad_b_range, bad_w_range)
In [34]:
figure14(x_train, y_train, b_initial, w_initial, bad_bs, bad_ws, bad_x_train)
Out[34]:
[Figure: 864x432, 2 subplots]
In [35]:
figure15(x_train, y_train, b_initial, w_initial, bad_bs, bad_ws, bad_x_train)
Out[35]:
[Figure: 864x432, 2 subplots]
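The stretched loss surface also shows up directly in the gradients: the formula in Step 3 multiplies each error by x_i in the derivative with respect to w, so scaling x up by 10 makes that gradient much larger in magnitude, while the gradient with respect to b stays comparable. A quick check we added (the names bad_yhat, bad_error, bad_b_grad, and bad_w_grad are ours):

In [ ]:
# Gradients at the same starting point, but using the "bad" feature
# (not in the original notebook)
bad_yhat = b_initial + w_initial * bad_x_train
bad_error = (bad_yhat - y_train)

bad_b_grad = 2 * bad_error.mean()
bad_w_grad = 2 * (bad_x_train * bad_error).mean()

# The w gradient dwarfs the b gradient, which is why a single
# learning rate struggles on this elongated surface
print(bad_b_grad, bad_w_grad)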

Scaling / Standardizing / Normalizing¶

$$ \Large \begin{aligned} \overline{X} &= \frac{1}{N}\sum_{i=1}^N{x_i} \\ \sigma(X) &= \sqrt{\frac{1}{N}\sum_{i=1}^N{(x_i - \overline{X})^2}} \\ \text{scaled } x_i &= \frac{x_i-\overline{X}}{\sigma(X)} \end{aligned} $$
In [36]:
scaler = StandardScaler(with_mean=True, with_std=True)
# We use the TRAIN set ONLY to fit the scaler
scaler.fit(x_train)

# Now we can use the already fit scaler to TRANSFORM
# both TRAIN and VALIDATION sets
scaled_x_train = scaler.transform(x_train)
scaled_x_val = scaler.transform(x_val)
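The scaler does exactly what the formula above describes: it subtracts the training mean and divides by the training standard deviation. A quick check we added (not in the original), using the statistics StandardScaler stores after fitting (mean_ and scale_):

In [ ]:
# Manual standardization (not in the original notebook), using the
# TRAIN set statistics only, exactly as in the formula above
train_mean = x_train.mean()
train_std = x_train.std()

manual_scaled_x_train = (x_train - train_mean) / train_std

# Should match both the scaler's stored statistics and its output
print(np.allclose(train_mean, scaler.mean_), np.allclose(train_std, scaler.scale_))
print(np.allclose(manual_scaled_x_train, scaled_x_train))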
In [37]:
fig, ax = plt.subplots(1, 3, figsize=(15, 6))
ax[0].scatter(x_train, y_train, c='b')
ax[0].set_xlabel('x')
ax[0].set_ylabel('y')
ax[0].set_ylim([0, 3.1])
ax[0].set_title('Train - Original')
ax[1].scatter(bad_x_train, y_train, c='k')
ax[1].set_xlabel('x')
ax[1].set_ylabel('y')
ax[1].set_ylim([0, 3.1])
ax[1].set_title('Train - "Bad"')
ax[1].label_outer()
ax[2].scatter(scaled_x_train, y_train, c='g')
ax[2].set_xlabel('x')
ax[2].set_ylabel('y')
ax[2].set_ylim([0, 3.1])
ax[2].set_title('Train - Scaled')
ax[2].label_outer()

fig.tight_layout()
In [38]:
# The ranges CHANGED AGAIN because we are centering at the new minimum, using "scaled" data
scaled_b_range = np.linspace(-1, 5, 101)
scaled_w_range = np.linspace(-2.4, 3.6, 101)
scaled_bs, scaled_ws = np.meshgrid(scaled_b_range, scaled_w_range)
In [39]:
figure17(x_train, y_train, scaled_bs, scaled_ws, bad_x_train, scaled_x_train)
Out[39]:
[Figure: 1080x432, 3 subplots]

Step 5: Rinse and Repeat!¶

In [40]:
figure18(x_train, y_train)
Out[40]:
[Figure: 432x432, 1 subplot]
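Putting Steps 1 to 4 inside a loop gives the full batch gradient descent procedure. The sketch below is ours, not the book's code: it restarts from the same random initialization, runs a fixed number of epochs on the original training data, and then compares the result to Scikit-Learn's closed-form LinearRegression (imported at the top) as a sanity check.

In [ ]:
# A minimal batch gradient descent loop (our sketch, not the book's code)
np.random.seed(42)
b_gd = np.random.randn(1)
w_gd = np.random.randn(1)

lr = 0.1        # back to the original learning rate
n_epochs = 1000

for epoch in range(n_epochs):
    # Step 1 - forward pass
    yhat = b_gd + w_gd * x_train
    # Step 2 - loss (MSE)
    error = (yhat - y_train)
    loss = (error ** 2).mean()
    # Step 3 - gradients
    b_grad = 2 * error.mean()
    w_grad = 2 * (x_train * error).mean()
    # Step 4 - parameter update
    b_gd = b_gd - lr * b_grad
    w_gd = w_gd - lr * w_grad

# Both should land close to the true values (b = 1, w = 2)
print(b_gd, w_gd)

# Sanity check: the closed-form fit from Scikit-Learn
linr = LinearRegression()
linr.fit(x_train, y_train)
print(linr.intercept_, linr.coef_[0])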

The Path of Gradient Descent¶

Even though the plots are important to illustrate the paths, the corresponding code is beyond the scope of this chapter.
