Hello,
I created a QNLP transfer-learning program using PyTorch and DistilBERT, and it runs correctly when I run it classically on a local simulator. However, I cannot get it to run on either IBMQ or AWS Braket, and after more than a month of debugging I still can't figure out what is wrong. When I target IBMQ, the job never reaches the IBMQ network and everything runs locally, regardless of what I set the IBMQ device to. It runs and produces results, but no job appears in the backend log on the service. When I run the code on AWS Braket, it hangs forever in the training phase (the line starting with runner.train()) and nothing shows up in the Braket "tasks" log. This happens on both quantum hardware and the simulator.
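To isolate the problem, a standalone check along these lines should register a job in the provider's console within seconds of running (a minimal sketch; the device line is a placeholder for the qiskit.ibmq or braket.aws.qubit device configured in the script below):

import pennylane as qml

# placeholder device: substitute the remote qiskit.ibmq or braket.aws.qubit
# device constructed in the full script
dev = qml.device("default.qubit", wires=2)

@qml.qnode(dev)
def bell_expval():
    qml.Hadamard(wires=0)
    qml.CNOT(wires=[0, 1])
    return qml.expval(qml.PauliZ(0))

print(bell_expval())  # with a remote device, a job/task should now be logged

If even a circuit this small never shows up on the backend, the issue is in the device configuration or credentials rather than in the model or training loop.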
Here is a link to the data: https://github.com/dancbeaulieu/publicshare/blob/main/sdg_classification.zip
Here is my code:
#!/usr/bin/env python
# coding: utf-8
path_to_data = 'data/sdg_classification'
train_filename = 'train_set_sdg_1_7_8_12_13_toy.csv'
validation_filename = 'val_set_sdg_1_7_8_12_13_toy.csv'
test_filename = 'eval_set_sdg_1_7_8_12_13_curated_journals_toy.csv'
text_field_name = 'title_keywords_abstract'
label_field_name = 'sdg_id'
path_to_test_pred_scores = 'data/output/pred.txt'
#model:
model_name = 'distilbert-base-uncased'  # pretrained model from Transformers
max_seq_length= 256 # depends on your available GPU memory (in combination with batch size)
num_classes= 5
n_in = 5 #input into quantum layer
#training:
learn_rate= 3e-5 # learning rate is typically ~1e-5 for transformers
num_epochs = 2  # around 2-6 epochs is typically fine when fine-tuning transformers
accum_steps= 4 # one optimization step for that many backward passes
batch_size= 8 # depends on your available GPU memory (in combination with max seq length)
log_dir = 'logdir'  # for training logs and tensorboard visualizations
fp16_params= None # fp16 support
q_depth = 2 # Depth of the quantum circuit (number of variational layers)
n_qubits = 2 # Number of qubits
step = 0.0004 # Learning rate
import time
#general:
seed=17 # random seed for reproducibility
gamma_lr_scheduler = 0.1 # Learning rate reduction applied every 10 epochs.
q_delta = 0.01 # Initial spread of random quantum weights
start_time = time.time() # Start of the computation timer
train_samples = 100
test_samples = 50
#data_dir = "texts/.arxiv/"
validation_split = .2
shuffle_dataset = True
import logging
from pathlib import Path
from typing import List, Mapping, Tuple

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.sampler import SubsetRandomSampler
#import torchvision
#from torchvision import datasets, transforms

import copy
import numpy as np
import pandas as pd
import yaml

from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
)

import pennylane as qml
import qiskit
from qiskit import IBMQ

# Catalyst version 21.x breaks all 20.x imports
from catalyst.dl import SupervisedRunner
from catalyst.dl.callbacks import (
    AccuracyCallback,
    CheckpointCallback,
    InferCallback,
    OptimizerCallback,
)
from catalyst.utils import prepare_cudnn, set_global_seed
# FOR CONNECTING TO IBMQ, REPLACED HUB, GROUP & PROJECT WITH FAKE INFO
#token = 'FAKE TOKEN'
#IBMQ.save_account(token)
#IBMQ.load_account()
#provider = IBMQ.get_provider(hub='HUB', group='GROUP', project='PROJECT')
#dev = qml.device("qiskit.ibmq", wires=n_qubits, backend="ibmq_bogota", provider=provider)
#dev = qml.device("default.qubit", wires=n_qubits)
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
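# NOTE: a QNode is bound to whichever device object is passed to the
# @qml.qnode decorator at definition time. If the decorator further down is
# built against default.qubit (or a stale `dev` left in a long-lived notebook
# kernel), everything runs locally and no job is logged on the IBMQ backend,
# which matches the symptom described above.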
# Please enter the S3 bucket you created during onboarding
# (or any other S3 bucket starting with 'amazon-braket-' in your account) in the code below
# CHANGED BUCKET NAME TO PROTECT MY INFORMATION
my_bucket = "braket-bucket-fake"  # the name of the bucket
my_prefix = "braket"  # the name of the folder in the bucket
s3_folder = (my_bucket, my_prefix)
# PennyLane hardware config
#device_arn = "arn:aws:braket:::device/quantum-simulator/amazon/sv1"
#device_arn = "arn:aws:braket:::device/qpu/ionq/ionQdevice"
device_arn = "arn:aws:braket:::device/qpu/rigetti/Aspen-9"
dev_remote = qml.device(
    "braket.aws.qubit",
    device_arn=device_arn,
    wires=n_qubits,
    s3_destination_folder=s3_folder,
    parallel=True,
)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
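# Sanity check (a minimal sketch, not part of the original pipeline): running
# one trivial circuit here should make a task appear in the Braket console
# within seconds. If nothing shows up, the problem is the device, credentials,
# or region rather than the model or the training loop.
@qml.qnode(dev_remote)
def _smoke_test():
    qml.Hadamard(wires=0)
    return qml.expval(qml.PauliZ(0))

print("smoke test:", _smoke_test())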
def get_project_root() -> Path:
    return Path(__file__).parent.parent
#data
class TextClassificationDataset(Dataset):
“”"
Wrapper around Torch Dataset to perform text classification
“”"
def __init__(
self,
texts: List[str],
labels: List[str] = None,
label_dict: Mapping[str, int] = None,
max_seq_length: int = 512,
model_name: str = "distilbert-base-uncased",
):
"""
Args:
texts (List[str]): a list with texts to classify or to train the
classifier on
labels List[str]: a list with classification labels (optional)
label_dict (dict): a dictionary mapping class names to class ids,
to be passed to the validation data (optional)
max_seq_length (int): maximal sequence length in tokens,
texts will be stripped to this length
model_name (str): transformer model name, needed to perform
appropriate tokenization
"""
self.texts = texts
self.labels = labels
self.label_dict = label_dict
self.max_seq_length = max_seq_length
if self.label_dict is None and labels is not None:
# {'class1': 0, 'class2': 1, 'class3': 2, ...}
# using this instead of `sklearn.preprocessing.LabelEncoder`
            # to easily handle unknown target values
self.label_dict = dict(zip(sorted(set(labels)), range(len(set(labels)))))
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
# suppresses tokenizer warnings
logging.getLogger("transformers.tokenization_utils").setLevel(logging.FATAL)
# special tokens for transformers
# in the simplest case a [CLS] token is added in the beginning
# and [SEP] token is added in the end of a piece of text
# [CLS] <indexes text tokens> [SEP] .. <[PAD]>
self.sep_vid = self.tokenizer.vocab["[SEP]"]
self.cls_vid = self.tokenizer.vocab["[CLS]"]
self.pad_vid = self.tokenizer.vocab["[PAD]"]
def __len__(self) -> int:
"""
Returns:
int: length of the dataset
"""
return len(self.texts)
def __getitem__(self, index) -> Mapping[str, torch.Tensor]:
"""Gets element of the dataset
Args:
index (int): index of the element in the dataset
Returns:
Single element by index
"""
# encoding the text
x = self.texts[index]
# a dictionary with `input_ids` and `attention_mask` as keys
output_dict = self.tokenizer.encode_plus(
x,
add_special_tokens=True,
padding="max_length",
max_length=self.max_seq_length,
return_tensors="pt",
truncation=True,
return_attention_mask=True,
)
# for Catalyst, there needs to be a key called features
output_dict["features"] = output_dict["input_ids"].squeeze(0)
del output_dict["input_ids"]
# encoding target
if self.labels is not None:
y = self.labels[index]
y_encoded = torch.Tensor([self.label_dict.get(y, -1)]).long().squeeze(0)
output_dict["targets"] = y_encoded
return output_dict
def read_data(params: dict) -> Tuple[dict, dict, Dataset, Dataset, Dataset]:
    """
    A custom function that reads data from CSV files and creates PyTorch
    datasets and data loaders. The output is structured to be easily used
    with Catalyst.
    :param params: a dictionary read from the config.yml file
    :return: a tuple with the train/valid loader dict, the test loader dict,
        and the train, validation, and test datasets
    """
# reading CSV files to Pandas dataframes
train_df = pd.read_csv(
Path(params["data"]["path_to_data"]) / params["data"]["train_filename"]
)
valid_df = pd.read_csv(
Path(params["data"]["path_to_data"]) / params["data"]["validation_filename"]
)
test_df = pd.read_csv(
Path(params["data"]["path_to_data"]) / params["data"]["test_filename"]
)
# creating PyTorch Datasets
train_dataset = TextClassificationDataset(
texts=train_df[params["data"]["text_field_name"]].values.tolist(),
labels=train_df[params["data"]["label_field_name"]].values,
max_seq_length=params["model"]["max_seq_length"],
model_name=params["model"]["model_name"],
)
valid_dataset = TextClassificationDataset(
texts=valid_df[params["data"]["text_field_name"]].values.tolist(),
labels=valid_df[params["data"]["label_field_name"]].values,
max_seq_length=params["model"]["max_seq_length"],
model_name=params["model"]["model_name"],
)
test_dataset = TextClassificationDataset(
texts=test_df[params["data"]["text_field_name"]].values.tolist(),
labels=test_df[params["data"]["label_field_name"]].values,
max_seq_length=params["model"]["max_seq_length"],
model_name=params["model"]["model_name"],
)
set_global_seed(params["general"]["seed"])
# creating PyTorch data loaders and placing them in dictionaries (for Catalyst)
train_val_loaders = {
"train": DataLoader(
dataset=train_dataset,
batch_size=params["training"]["batch_size"],
shuffle=True,
),
"valid": DataLoader(
dataset=valid_dataset,
batch_size=params["training"]["batch_size"],
shuffle=False,
),
}
test_loaders = {
"test": DataLoader(
dataset=test_dataset,
batch_size=params["training"]["batch_size"],
shuffle=False,
)
}
return train_val_loaders, test_loaders, train_dataset, valid_dataset, test_dataset
class BertForSequenceClassification(nn.Module):
“”"
Simplified version of the same class by HuggingFace.
See transformers/modeling_distilbert.py in the transformers repository.
“”"
def __init__(
self, pretrained_model_name: str, num_classes: int = None, dropout: float = 0.3
):
"""
Args:
pretrained_model_name (str): HuggingFace model name.
See transformers/modeling_auto.py
num_classes (int): the number of class labels
in the classification task
"""
super().__init__()
config = AutoConfig.from_pretrained(
pretrained_model_name, num_labels=num_classes
)
self.model = AutoModel.from_pretrained(pretrained_model_name, config=config)
#for param in self.parameters():
# param.requires_grad = False
self.classifier = nn.Linear(config.hidden_size, num_classes)
self.dropout = nn.Dropout(dropout)
def forward(self, features, attention_mask=None, head_mask=None):
"""Compute class probabilities for the input sequence.
Args:
features (torch.Tensor): ids of each token,
size ([bs, seq_length]
attention_mask (torch.Tensor): binary tensor, used to select
tokens which are used to compute attention scores
in the self-attention heads, size [bs, seq_length]
head_mask (torch.Tensor): 1.0 in head_mask indicates that
we keep the head, size: [num_heads]
or [num_hidden_layers x num_heads]
Returns:
PyTorch Tensor with predicted class scores
"""
assert attention_mask is not None, "attention mask is none"
# taking BERTModel output
# see https://huggingface.co/transformers/model_doc/bert.html#transformers.BertModel
bert_output = self.model(
input_ids=features, attention_mask=attention_mask, head_mask=head_mask
)
# we only need the hidden state here and don't need
# transformer output, so index 0
seq_output = bert_output[0] # (bs, seq_len, dim)
# mean pooling, i.e. getting average representation of all tokens
pooled_output = seq_output.mean(axis=1) # (bs, dim)
pooled_output = self.dropout(pooled_output) # (bs, dim)
scores = self.classifier(pooled_output) # (bs, num_classes)
return scores
with open("config.yml") as f:
params = yaml.load(f, Loader=yaml.FullLoader)
def H_layer(nqubits):
    """Layer of single-qubit Hadamard gates."""
    for idx in range(nqubits):
        qml.Hadamard(wires=idx)

def RY_layer(w):
    """Layer of parametrized qubit rotations around the y axis."""
    for idx, element in enumerate(w):
        qml.RY(element, wires=idx)

def RX_layer(w):
    """Layer of parametrized qubit rotations around the x axis."""
    for idx, element in enumerate(w):
        qml.RX(element, wires=idx)

def RZ_layer(w):
    """Layer of parametrized qubit rotations around the z axis."""
    for idx, element in enumerate(w):
        qml.RZ(element, wires=idx)
def entangling_layer(nqubits):
    """Layer of CNOTs followed by another shifted layer of CNOTs."""
    # In other words, it applies something like:
    # CNOT  CNOT  CNOT  CNOT ...  CNOT
    #    CNOT  CNOT  CNOT ...  CNOT
    for i in range(0, nqubits - 1, 2):  # loop over even indices: i = 0, 2, ..., N-2
        qml.CNOT(wires=[i, i + 1])
    for i in range(1, nqubits - 1, 2):  # loop over odd indices: i = 1, 3, ..., N-3
        qml.CNOT(wires=[i, i + 1])

def entangling_layer_alt(nqubits):
    """Alternate entangling layer; currently identical to entangling_layer."""
    for i in range(0, nqubits - 1, 2):
        qml.CNOT(wires=[i, i + 1])
    for i in range(1, nqubits - 1, 2):
        qml.CNOT(wires=[i, i + 1])
# Original version of the quantum circuit used in Mari et al. 2019
@qml.qnode(dev_remote, interface="torch")
def quantum_net(q_input_features, q_weights_flat):
    """
    The variational quantum circuit.
    """
    # Reshape weights
    q_weights = q_weights_flat.reshape(q_depth, n_qubits)
    # Start from state |+>, unbiased w.r.t. |0> and |1>
    H_layer(n_qubits)
    # Embed features in the quantum node
    RY_layer(q_input_features)
    # Sequence of trainable variational layers
    for k in range(q_depth):
        entangling_layer(n_qubits)
        RY_layer(q_weights[k])
    # Expectation values in the Z basis
    exp_vals = [qml.expval(qml.PauliZ(position)) for position in range(n_qubits)]
    return tuple(exp_vals)
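# Quick check worth leaving in while debugging: the QNode records the device
# it was constructed with, so this should print the Braket device, not a local
# simulator. If it prints default.qubit, the QNode was bound to the wrong
# device at definition time.
print("quantum_net device:", quantum_net.device)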
# read and process data
train_val_loaders, test_loaders, train_dataset, valid_dataset, test_dataset = read_data(params)
# reproducibility
set_global_seed(seed)
prepare_cudnn(deterministic=True)
class DressedQuantumNet(nn.Module):
“”"
Torch module implementing the dressed quantum net.
“”"
def __init__(self):
"""
Definition of the *dressed* layout.
"""
super().__init__()
self.pre_net = nn.Linear(num_classes, n_qubits)
self.q_params = nn.Parameter(q_delta * torch.randn(q_depth * n_qubits))
self.post_net = nn.Linear(n_qubits, 2)
def forward(self, input_features):
"""
Defining how tensors are supposed to move through the *dressed* quantum
net.
"""
        # obtain the input features for the quantum circuit by reducing the
        # feature dimension from num_classes to n_qubits
pre_out = self.pre_net(input_features)
q_in = torch.tanh(pre_out) * np.pi / 2.0
        # Apply the quantum circuit to each element of the batch and append to
        # q_out. Note: each iteration is a separate circuit execution, so on a
        # remote backend every sample becomes its own job/task.
q_out = torch.Tensor(0, n_qubits)
q_out = q_out.to(device)
for elem in q_in:
q_out_elem = quantum_net(elem, self.q_params).float().unsqueeze(0)
q_out = torch.cat((q_out, q_out_elem))
# return the two-dimensional prediction from the postprocessing layer
return self.post_net(q_out)
class BertForSequenceClassification_qnlp(nn.Module):
“”"
Simplified version of the same class by HuggingFace.
See transformers/modeling_distilbert.py in the transformers repository.
“”"
def __init__(
self, pretrained_model_name: str, num_classes: int = None, dropout: float = 0.3
):
"""
Args:
pretrained_model_name (str): HuggingFace model name.
See transformers/modeling_auto.py
num_classes (int): the number of class labels
in the classification task
"""
super().__init__()
config = AutoConfig.from_pretrained(
pretrained_model_name, num_labels=num_classes
)
self.model = AutoModel.from_pretrained(pretrained_model_name, config=config)
#for param in self.parameters():
# param.requires_grad = False
self.classifier = nn.Linear(config.hidden_size, num_classes)
self.linear = DressedQuantumNet()
self.dropout = nn.Dropout(dropout)
def forward(self, features, attention_mask=None, head_mask=None):
"""Compute class probabilities for the input sequence.
Args:
features (torch.Tensor): ids of each token,
size ([bs, seq_length]
attention_mask (torch.Tensor): binary tensor, used to select
tokens which are used to compute attention scores
in the self-attention heads, size [bs, seq_length]
head_mask (torch.Tensor): 1.0 in head_mask indicates that
we keep the head, size: [num_heads]
or [num_hidden_layers x num_heads]
Returns:
PyTorch Tensor with predicted class scores
"""
assert attention_mask is not None, "attention mask is none"
# taking BERTModel output
# see https://huggingface.co/transformers/model_doc/bert.html#transformers.BertModel
bert_output = self.model(
input_ids=features, attention_mask=attention_mask, head_mask=head_mask
)
# we only need the hidden state here and don't need
# transformer output, so index 0
seq_output = bert_output[0] # (bs, seq_len, dim)
# mean pooling, i.e. getting average representation of all tokens
pooled_output = seq_output.mean(axis=1) # (bs, dim)
pooled_output = self.dropout(pooled_output) # (bs, dim)
scores = self.classifier(pooled_output) # (bs, num_classes)
return scores
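# NOTE (a sketch, not a drop-in fix): as written, forward() never calls
# self.linear, so DressedQuantumNet (and with it quantum_net and dev_remote)
# is never executed, and no circuit is ever submitted to IBMQ or Braket from
# this path. One way to actually route through the quantum layer, assuming
# post_net in DressedQuantumNet is widened to nn.Linear(n_qubits, num_classes)
# so the CrossEntropyLoss shapes still match:
#
#     class_features = self.classifier(pooled_output)  # (bs, num_classes)
#     scores = self.linear(class_features)             # runs quantum_net per sample
#     return scores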
model_dressnet = BertForSequenceClassification_qnlp(
    pretrained_model_name=model_name,
    num_classes=num_classes,
)
# Use CUDA or CPU according to the "device" object
model_dressnet = model_dressnet.to(device)
# specify criterion for the multi-class classification task, optimizer, and scheduler
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
model_dressnet.parameters(), lr=float(learn_rate)
)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
# here we specify that we pass masks to the runner, so the model's forward
# method will be called with these arguments
runner = SupervisedRunner(input_key=("features", "attention_mask"))
#quantum_net_alt: original alternate version based on a randomized quantum circuit
# finally, training the model with Catalyst
runner.train(
    #input_key=("features", "attention_mask"),
model=model_dressnet,
criterion=criterion,
optimizer=optimizer,
scheduler=scheduler,
loaders=train_val_loaders,
callbacks=[
AccuracyCallback(num_classes=num_classes),
OptimizerCallback(accumulation_steps=accum_steps),
],
logdir=log_dir,
num_epochs=num_epochs,
verbose=True
)