Training hybrid models using the PennyLane backend

In this example, we will first train a pure quantum model using PennyLane and PyTorch to classify whether a sentence is about cooking or computing. We will then train a hybrid model that takes in pairs of sentences and determines whether they are talking about the same or different topics.

[1]:
BATCH_SIZE = 10
EPOCHS = 15
SEED = 2
[2]:
import torch
import random
import numpy as np

torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

Read in the data and create diagrams

[3]:
def read_data(filename):
    labels, sentences = [], []
    with open(filename) as f:
        for line in f:
            t = float(line[0])
            labels.append([t, 1-t])
            sentences.append(line[1:].strip())
    return labels, sentences


train_labels, train_data = read_data('datasets/mc_train_data.txt')
dev_labels, dev_data = read_data('datasets/mc_dev_data.txt')
test_labels, test_data = read_data('datasets/mc_test_data.txt')
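
We can sanity-check the loading by peeking at the first training example and the split sizes (the exact sentence shown depends on the dataset files):

print(train_labels[0], train_data[0])
print(len(train_data), len(dev_data), len(test_data))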
[4]:
from lambeq import BobcatParser

reader = BobcatParser(verbose='text')

raw_train_diagrams = reader.sentences2diagrams(train_data)
raw_dev_diagrams = reader.sentences2diagrams(dev_data)
raw_test_diagrams = reader.sentences2diagrams(test_data)
Tagging sentences.
Parsing tagged sentences.
Turning parse trees to diagrams.
Tagging sentences.
Parsing tagged sentences.
Turning parse trees to diagrams.
Tagging sentences.
Parsing tagged sentences.
Turning parse trees to diagrams.

Remove cups

Removing the cups from the diagrams reduces the number of post-selections in the resulting circuits, allowing them to be evaluated more efficiently.

[5]:
from lambeq import remove_cups

train_diagrams = [remove_cups(diagram) for diagram in raw_train_diagrams]
dev_diagrams = [remove_cups(diagram) for diagram in raw_dev_diagrams]
test_diagrams = [remove_cups(diagram) for diagram in raw_test_diagrams]

train_diagrams[0].draw()
(Output: the string diagram for the first training sentence, with cups removed.)

Create DisCoPy circuits

[6]:
from lambeq import AtomicType, IQPAnsatz

ansatz = IQPAnsatz({AtomicType.NOUN: 1, AtomicType.SENTENCE: 1},
                   n_layers=1, n_single_qubit_params=3)

train_circuits = [ansatz(diagram) for diagram in train_diagrams]
dev_circuits = [ansatz(diagram) for diagram in dev_diagrams]
test_circuits = [ansatz(diagram) for diagram in test_diagrams]

train_circuits[0].draw(figsize=(6, 8))
(Output: the parameterised quantum circuit for the first training sentence.)
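
Each circuit now carries symbolic parameters introduced by the ansatz. DisCoPy circuits expose these through a free_symbols attribute (worth checking against your installed version):

print(len(train_circuits[0].free_symbols))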

Create (pure quantum) model and initialise parameters

[7]:
from lambeq import PennyLaneModel

all_circuits = train_circuits + dev_circuits + test_circuits

model = PennyLaneModel.from_diagrams(all_circuits)
model.initialise_weights()
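
After initialisation, the circuit parameters are available through model.weights (the same attribute the standard PyTorch training loop below reads a dtype from):

print(len(model.weights))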

Prepare train dataset

[8]:
from lambeq import Dataset

train_dataset = Dataset(train_circuits,
                        train_labels,
                        batch_size=BATCH_SIZE)

val_dataset = Dataset(dev_circuits, dev_labels)
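
Iterating over a Dataset yields (circuits, labels) batches, which is how the training loops below consume it. For example:

circuits_batch, labels_batch = next(iter(train_dataset))
print(len(circuits_batch), len(labels_batch))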

Training

Using PytorchTrainer

[9]:
def acc(y_hat, y):
    return (torch.argmax(y_hat, dim=1) ==
            torch.argmax(y, dim=1)).sum().item()/len(y)

def loss(y_hat, y):
    return torch.nn.functional.mse_loss(y_hat, y)
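
As a quick check of acc: given two predictions of which only the first matches its one-hot label, it returns 0.5.

print(acc(torch.tensor([[0.9, 0.1], [0.3, 0.7]]),
          torch.tensor([[1., 0.], [1., 0.]])))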
[10]:
from lambeq import PytorchTrainer

trainer = PytorchTrainer(
        model=model,
        loss_function=loss,
        optimizer=torch.optim.Adam,
        learning_rate=0.1,
        epochs=EPOCHS,
        evaluate_functions={"acc": acc},
        evaluate_on_train=True,
        use_tensorboard=False,
        verbose='text',
        seed=SEED
    )

trainer.fit(train_dataset, val_dataset)
Epoch 1:   train/loss: 0.1778   valid/loss: 0.1430   train/acc: 0.7286   valid/acc: 0.8000
Epoch 2:   train/loss: 0.1054   valid/loss: 0.1081   train/acc: 0.8857   valid/acc: 0.9000
Epoch 3:   train/loss: 0.0716   valid/loss: 0.0658   train/acc: 0.9286   valid/acc: 0.9000
Epoch 4:   train/loss: 0.0618   valid/loss: 0.0262   train/acc: 0.9429   valid/acc: 0.9667
Epoch 5:   train/loss: 0.0427   valid/loss: 0.0468   train/acc: 0.9714   valid/acc: 0.9333
Epoch 6:   train/loss: 0.0159   valid/loss: 0.0261   train/acc: 0.9714   valid/acc: 0.9667
Epoch 7:   train/loss: 0.0212   valid/loss: 0.0130   train/acc: 0.9857   valid/acc: 1.0000
Epoch 8:   train/loss: 0.0192   valid/loss: 0.0246   train/acc: 0.9714   valid/acc: 1.0000
Epoch 9:   train/loss: 0.0804   valid/loss: 0.1782   train/acc: 0.9143   valid/acc: 0.6667
Epoch 10:  train/loss: 0.0386   valid/loss: 0.1091   train/acc: 0.9286   valid/acc: 0.8667
Epoch 11:  train/loss: 0.0384   valid/loss: 0.0877   train/acc: 0.9571   valid/acc: 0.8667
Epoch 12:  train/loss: 0.0226   valid/loss: 0.0855   train/acc: 0.9857   valid/acc: 0.8667
Epoch 13:  train/loss: 0.0211   valid/loss: 0.0244   train/acc: 0.9714   valid/acc: 0.9333
Epoch 14:  train/loss: 0.0121   valid/loss: 0.0616   train/acc: 1.0000   valid/acc: 0.9333
Epoch 15:  train/loss: 0.0456   valid/loss: 0.0387   train/acc: 0.9571   valid/acc: 1.0000

Training completed!

Determine test accuracy

[11]:
def accuracy(circs, labels):
    probs = model(circs)
    return (torch.argmax(probs, dim=1) ==
            torch.argmax(torch.tensor(labels), dim=1)).sum().item()/len(circs)

accuracy(test_circuits, test_labels)
[11]:
0.9

Using standard PyTorch

As we have a small dataset, we can use early stopping to prevent overfitting to the training data: if the validation accuracy has not improved for 10 epochs, we halt training and restore the best checkpoint.

[12]:
def accuracy(circs, labels):
    probs = model(circs)
    return (torch.argmax(probs, dim=1) ==
            torch.argmax(torch.tensor(labels), dim=1)).sum().item()/len(circs)
[13]:
model = PennyLaneModel.from_diagrams(all_circuits)
model.initialise_weights()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

best = {'acc': 0, 'epoch': 0}

for i in range(EPOCHS):
    epoch_loss = 0
    for circuits, labels in train_dataset:
        optimizer.zero_grad()
        probs = model(circuits)
        # cast the outputs to the dtype of the circuit parameters
        d_type = model.weights[0].dtype
        probs = probs.to(d_type)
        loss = torch.nn.functional.mse_loss(probs,
                                            torch.tensor(labels))
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()

    if i % 5 == 0:
        dev_acc = accuracy(dev_circuits, dev_labels)

        print("Epoch: {}".format(i))
        print("Train loss: {}".format(epoch_loss))
        print("Dev acc: {}".format(dev_acc))

        if dev_acc > best['acc']:
            best['acc'] = dev_acc
            best['epoch'] = i
            model.save("model.lt")  # checkpoint the best model so far
        elif i - best['epoch'] >= 10:
            print("Early stopping")
            model.load("model.lt")  # restore the best checkpoint
            break
Epoch: 0
Train loss: 1.61824881285429
Dev acc: 0.6333333333333333
Epoch: 5
Train loss: 0.4483018293976784
Dev acc: 0.8666666666666667
Epoch: 10
Train loss: 0.07727803220041096
Dev acc: 0.9333333333333333

Determine the test accuracy

[14]:
accuracy(test_circuits, test_labels)
[14]:
0.9666666666666667

Creating a hybrid model

This model will take in pairs of diagrams and attempt to determine whether they are talking about the same or different topics. It does this by first running the circuits to get a probability output on the open wire, and then passing this output to a simple neural network. We expect the circuits to learn to output [0, 1] or [1, 0] depending on the topic they are referring to (cooking or computing), and the neural network to learn an XNOR-like comparison of these outputs, outputting 1 when the topics are the same and 0 when they are different, matching the pair labels constructed below. PennyLane allows us to train both the circuits and the neural network simultaneously using PyTorch autograd.
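
Schematically, with idealised one-hot circuit outputs, the target function on a pair looks like this (a sketch for intuition only; which topic maps to which vector is learned, not fixed in advance):

cooking, computing = torch.tensor([1., 0.]), torch.tensor([0., 1.])

def same_topic(a, b):
    # 1 when both sentences share a topic, 0 otherwise: the function
    # we hope the network approximates from the continuous circuit outputs
    return int(torch.equal(a, b))

print(same_topic(cooking, cooking), same_topic(cooking, computing))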

[15]:
BATCH_SIZE = 50
EPOCHS = 100
SEED = 2

As the probability outputs from our circuits are guaranteed to be positive, we transform these outputs x by 2 * (x - 0.5), giving inputs to the neural network in the range [-1, 1]. This helps us to avoid “dying ReLUs”, which could otherwise occur if all the input weights to a given neuron were negative, leading to the gradient of all these weights being 0. (Alternative approaches include initialising all the neural-network weights to be positive, or using LeakyReLU as the activation function.)
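
As a quick illustration of the rescaling:

x = torch.tensor([0.0, 0.25, 0.5, 1.0])
print(2 * (x - 0.5))  # tensor([-1.0000, -0.5000,  0.0000,  1.0000])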

[16]:
from torch import nn

class XORSentenceModel(PennyLaneModel):
    def __init__(self, **kwargs):
        PennyLaneModel.__init__(self, **kwargs)

        # small feed-forward network mapping the two concatenated
        # circuit outputs (4 values) to a single same/different score
        self.xor_net = nn.Sequential(
            nn.Linear(4, 10),
            nn.ReLU(),
            nn.Linear(10, 1),
            nn.Sigmoid()
        )

    def forward(self, diagram_pairs):
        # evaluate both circuits in each pair, concatenate their
        # 2-dimensional probability outputs into a 4-dimensional vector,
        # rescale from [0, 1] to [-1, 1] and pass through the network
        a, b = zip(*diagram_pairs)
        evaluated_pairs = torch.cat((self.get_diagram_output(a),
                                     self.get_diagram_output(b)),
                                    dim=1)
        evaluated_pairs = 2 * (evaluated_pairs - 0.5)
        return self.xor_net(evaluated_pairs)

Make paired dataset

[17]:
from itertools import combinations

def make_pair_data(diagrams, labels):
    # pair up every two sentences; a pair is labelled 1 if both
    # sentences have the same topic and 0 otherwise
    pair_diags = list(combinations(diagrams, 2))
    pair_labels = [int(x[0] == y[0]) for x, y in combinations(labels, 2)]

    return pair_diags, pair_labels

train_pair_circuits, train_pair_labels = make_pair_data(train_circuits,
                                                        train_labels)
dev_pair_circuits, dev_pair_labels = make_pair_data(dev_circuits, dev_labels)
test_pair_circuits, test_pair_labels = make_pair_data(test_circuits,
                                                      test_labels)

There are lots of pairs (the 70 training sentences give 70 × 69 / 2 = 2415 training pairs), so we’ll sample a subset to make this example train more quickly.

[18]:
TRAIN_SAMPLES, DEV_SAMPLES, TEST_SAMPLES = 300, 200, 200
[19]:
train_pair_circuits, train_pair_labels = (
    zip(*random.sample(list(zip(train_pair_circuits, train_pair_labels)),
                       TRAIN_SAMPLES)))
dev_pair_circuits, dev_pair_labels = (
    zip(*random.sample(list(zip(dev_pair_circuits, dev_pair_labels)), DEV_SAMPLES)))
test_pair_circuits, test_pair_labels = (
    zip(*random.sample(list(zip(test_pair_circuits, test_pair_labels)), TEST_SAMPLES)))

Initialise the model

[20]:
all_pair_circuits = (train_pair_circuits +
                     dev_pair_circuits +
                     test_pair_circuits)
a, b = zip(*all_pair_circuits)

model = XORSentenceModel.from_diagrams(a + b)
model.initialise_weights()
model = model.double()  # circuit outputs are float64, so cast the NN weights to match

train_pair_dataset = Dataset(train_pair_circuits,
                             train_pair_labels,
                             batch_size=BATCH_SIZE)

optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

Train the model and log accuracies

Only log every five epochs as evaluating is expensive.

[21]:
def accuracy(circs, labels):
    predicted = model(circs)
    return (torch.round(torch.flatten(predicted)) ==
            torch.DoubleTensor(labels)).sum().item()/len(circs)
[22]:
best = {'acc': 0, 'epoch': 0}

for i in range(EPOCHS):
    epoch_loss = 0
    for circuits, labels in train_pair_dataset:
        optimizer.zero_grad()
        predicted = model(circuits)
        loss = torch.nn.functional.binary_cross_entropy(
            torch.flatten(predicted), torch.DoubleTensor(labels))
        epoch_loss += loss.item()
        loss.backward()
        optimizer.step()

    if i % 5 == 0:
        dev_acc = accuracy(dev_pair_circuits, dev_pair_labels)

        print("Epoch: {}".format(i))
        print("Train loss: {}".format(epoch_loss))
        print("Dev acc: {}".format(dev_acc))

        if dev_acc > best['acc']:
            best['acc'] = dev_acc
            best['epoch'] = i
            model.save("xor_model.lt")
        elif i - best['epoch'] >= 10:
            print("Early stopping")
            model.load("xor_model.lt")
            model = model.double()
            break
Epoch: 0
Train loss: 4.202134637529463
Dev acc: 0.445
Epoch: 5
Train loss: 4.152931421145276
Dev acc: 0.54
Epoch: 10
Train loss: 3.9137668866046615
Dev acc: 0.62
Epoch: 15
Train loss: 3.4010897127113733
Dev acc: 0.655
Epoch: 20
Train loss: 2.41645528073079
Dev acc: 0.825
Epoch: 25
Train loss: 2.5319179155031915
Dev acc: 0.885
Epoch: 30
Train loss: 0.2144302081166329
Dev acc: 0.995
Epoch: 35
Train loss: 0.07569202568470873
Dev acc: 1.0
Epoch: 40
Train loss: 0.04757411482113485
Dev acc: 1.0
Epoch: 45
Train loss: 0.033591968613830424
Dev acc: 1.0
Early stopping
[23]:
accuracy(test_pair_circuits, test_pair_labels)
[23]:
1.0