Scalable GP Classification (w/ KISS-GP)

This example shows how to use the GridInterpolationVariationalStrategy module. This variational strategy is designed for classification when the function you're modeling has 2-3 dimensional inputs and you don't believe the output can be additively decomposed.

In this example, the true function is a checkerboard of 1/3 x 1/3 squares, which we encode as class labels of 0 or 1.

Here we use KISS-GP (https://arxiv.org/pdf/1503.01057.pdf) to perform the classification.
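Briefly, KISS-GP (also known as structured kernel interpolation, SKI) places inducing points on a regular grid $U$ and approximates kernel values by interpolating from that grid,

$$k(\mathbf{x}, \mathbf{z}) \approx \mathbf{w}_{\mathbf{x}}^\top K_{UU} \mathbf{w}_{\mathbf{z}},$$

where $\mathbf{w}_{\mathbf{x}}$ is a sparse vector of local interpolation weights and $K_{UU}$ is the kernel matrix evaluated on the grid. The regular grid gives $K_{UU}$ Kronecker and Toeplitz structure, which keeps the linear algebra cheap; this is what makes the $10 \times 10$ inducing grid used below inexpensive to work with.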

[1]:
import math
import torch
import gpytorch
from matplotlib import pyplot as plt
from math import exp

%matplotlib inline
%load_ext autoreload
%autoreload 2
[2]:
# We make an nxn grid of training points
# In [0,1]x[0,1] spaced every 1/(n-1)
n = 30
train_x = torch.zeros(int(pow(n, 2)), 2)
train_y = torch.zeros(int(pow(n, 2)))
for i in range(n):
    for j in range(n):
        train_x[i * n + j][0] = float(i) / (n - 1)
        train_x[i * n + j][1] = float(j) / (n - 1)
        # True latent function is a checkerboard of 1/3 x 1/3 squares with values -1 or 1,
        # which we map to class labels 0 and 1 (matching the 0.5 threshold used at prediction time)
        train_y[i * n + j] = (pow(-1, int(3 * i / n) + int(3 * j / n)) + 1) // 2
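As a quick sanity check (optional; this snippet is not part of the original notebook), you can plot the training labels to confirm they form the expected 3x3 checkerboard:

# Visualize the 0/1 training labels on the unit square
plt.figure(figsize=(4, 3))
plt.scatter(train_x[:, 0], train_x[:, 1], c=train_y, s=4)
plt.title('Training labels')
plt.show()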
[3]:
from gpytorch.models import AbstractVariationalGP
from gpytorch.variational import CholeskyVariationalDistribution
from gpytorch.variational import GridInterpolationVariationalStrategy

class GPClassificationModel(AbstractVariationalGP):
    def __init__(self, grid_size=10, grid_bounds=[(0, 1), (0, 1)]):
        variational_distribution = CholeskyVariationalDistribution(int(pow(grid_size, len(grid_bounds))))
        variational_strategy = GridInterpolationVariationalStrategy(self, grid_size, grid_bounds, variational_distribution)
        super(GPClassificationModel, self).__init__(variational_strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(
            gpytorch.kernels.RBFKernel(
                lengthscale_prior=gpytorch.priors.SmoothedBoxPrior(
                    exp(0), exp(3), sigma=0.1, transform=torch.exp
                )
            )
        )

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        latent_pred = gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
        return latent_pred


model = GPClassificationModel()
likelihood = gpytorch.likelihoods.BernoulliLikelihood()
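With the default grid_size=10 and two input dimensions, the interpolation grid has 10**2 = 100 points, so the CholeskyVariationalDistribution parameterizes a 100-dimensional Gaussian over the inducing values. If you want to verify this (an optional sketch, not part of the original notebook), you can list the model's parameters and their shapes:

# Print every trainable parameter and its shape; the variational mean should have 100 entries
for name, param in model.named_parameters():
    print(name, tuple(param.shape))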
[4]:
from gpytorch.mlls.variational_elbo import VariationalELBO

# Find optimal model hyperparameters
model.train()
likelihood.train()

# Use the adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

# "Loss" for GPs - the marginal log likelihood
# n_data refers to the amount of training data
mll = VariationalELBO(likelihood, model, train_y.numel())

def train():
    num_iter = 50
    for i in range(num_iter):
        optimizer.zero_grad()
        output = model(train_x)
        # Calc loss and backprop gradients
        loss = -mll(output, train_y)
        loss.backward()
        print('Iter %d/%d - Loss: %.3f' % (i + 1, num_iter, loss.item()))
        optimizer.step()

# Time the training loop
%time train()
Iter 1/50 - Loss: 0.912
Iter 2/50 - Loss: 9.193
Iter 3/50 - Loss: 4.586
Iter 4/50 - Loss: 5.161
Iter 5/50 - Loss: 5.233
Iter 6/50 - Loss: 3.804
Iter 7/50 - Loss: 2.513
Iter 8/50 - Loss: 2.269
Iter 9/50 - Loss: 2.510
Iter 10/50 - Loss: 2.387
Iter 11/50 - Loss: 1.988
Iter 12/50 - Loss: 1.673
Iter 13/50 - Loss: 1.548
Iter 14/50 - Loss: 1.498
Iter 15/50 - Loss: 1.391
Iter 16/50 - Loss: 1.222
Iter 17/50 - Loss: 1.086
Iter 18/50 - Loss: 1.007
Iter 19/50 - Loss: 0.956
Iter 20/50 - Loss: 0.911
Iter 21/50 - Loss: 0.839
Iter 22/50 - Loss: 0.808
Iter 23/50 - Loss: 0.788
Iter 24/50 - Loss: 0.779
Iter 25/50 - Loss: 0.756
Iter 26/50 - Loss: 0.739
Iter 27/50 - Loss: 0.725
Iter 28/50 - Loss: 0.715
Iter 29/50 - Loss: 0.690
Iter 30/50 - Loss: 0.679
Iter 31/50 - Loss: 0.661
Iter 32/50 - Loss: 0.644
Iter 33/50 - Loss: 0.639
Iter 34/50 - Loss: 0.614
Iter 35/50 - Loss: 0.597
Iter 36/50 - Loss: 0.579
Iter 37/50 - Loss: 0.576
Iter 38/50 - Loss: 0.559
Iter 39/50 - Loss: 0.542
Iter 40/50 - Loss: 0.529
Iter 41/50 - Loss: 0.521
Iter 42/50 - Loss: 0.504
Iter 43/50 - Loss: 0.501
Iter 44/50 - Loss: 0.483
Iter 45/50 - Loss: 0.466
Iter 46/50 - Loss: 0.458
Iter 47/50 - Loss: 0.454
Iter 48/50 - Loss: 0.444
Iter 49/50 - Loss: 0.440
Iter 50/50 - Loss: 0.427
CPU times: user 11.8 s, sys: 21.5 s, total: 33.3 s
Wall time: 6.75 s
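Before evaluating on the dense test grid, you can optionally check how well the model fits the training data (this snippet is not part of the original notebook):

# Fraction of training points classified correctly after training
model.eval()
likelihood.eval()
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    train_probs = likelihood(model(train_x)).mean
train_acc = train_probs.ge(0.5).float().eq(train_y).float().mean().item()
print('Train accuracy: %.3f' % train_acc)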
[5]:
# Set model and likelihood into eval mode
model.eval()
likelihood.eval()

# Initialize figure and axis
f, ax = plt.subplots(1, 1, figsize=(4, 3))

n = 100
test_x = torch.zeros(int(pow(n, 2)), 2)
for i in range(n):
    for j in range(n):
        test_x[i * n + j][0] = float(i) / (n-1)
        test_x[i * n + j][1] = float(j) / (n-1)

with torch.no_grad(), gpytorch.settings.fast_pred_var():
    predictions = likelihood(model(test_x))

# prob < 0.5 --> label 0; prob > 0.5 --> label 1
pred_labels = predictions.mean.ge(0.5).float().numpy()
# Colors: yellow for label 1, red for label 0
color = []
for i in range(len(pred_labels)):
    if pred_labels[i] == 1:
        color.append('y')
    else:
        color.append('r')

# Plot the data as a scatter plot
ax.scatter(test_x[:, 0], test_x[:, 1], color=color, s=1)
ax.set_ylim([-0.5, 1.5])
[5]:
(-0.5, 1.5)
[Output figure: scatter plot of the predicted class labels on the 100 x 100 test grid]