Hybrid Network not differentiating

Hi, I am training a hybrid network with a QNode as a PyTorch layer. Transfer learning works fine, but I can't train the whole network. It seems that the PennyLane/Torch interface tries to differentiate the inputs, which results in the error below. I inserted a print statement in the pennylane/interfaces/torch.py module to confirm this.
My StackTrace:

ValueError                                Traceback (most recent call last)
<ipython-input-5-846e188427fd> in <module>
     67 optimizer = optim.SGD(hybrid_model.parameters(), lr=0.01, momentum=0.8)
     68 
---> 69 qHist = train(device, hybrid_model, optimizer, criterion, 1, dataloader_train, dataloader_test)
     70 
     71 plotTrainingResults([cHist, qHist], ["classical", "quantum"])

~\Hahn_schickard\jupyter\pyTorch_utils.py in train(device, model, optimizer, criterion, epochs, training, testing)
     35             correct += (predicted == labels).float().sum()
     36             loss = criterion(outputs, labels)
---> 37             loss.backward()
     38             optimizer.step()
     39             batch_percentage = i*20//number_batches

~\anaconda3\envs\myenv\lib\site-packages\torch\tensor.py in backward(self, gradient, retain_graph, create_graph, inputs)
    243                 create_graph=create_graph,
    244                 inputs=inputs)
--> 245         torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
    246 
    247     def register_hook(self, hook):

~\anaconda3\envs\myenv\lib\site-packages\torch\autograd\__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    145     Variable._execution_engine.run_backward(
    146         tensors, grad_tensors_, retain_graph, create_graph, inputs,
--> 147         allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
    148 
    149 

~\anaconda3\envs\myenv\lib\site-packages\torch\autograd\function.py in apply(self, *args)
     87     def apply(self, *args):
     88         # _forward_cls is defined by derived class
---> 89         return self._forward_cls.backward(self, *args)  # type: ignore
     90 
     91 

~\anaconda3\envs\myenv\lib\site-packages\pennylane\interfaces\torch.py in backward(ctx, dy)
    176         if dy.is_cuda:
    177             cuda_device = dy.get_device()
--> 178         vjp = dy.view(1, -1) @ ctx.jacobian.apply(ctx, *ctx.saved_tensors).to(dy)
    179         vjp = torch.unbind(vjp.view(-1))
    180         return (None,) + tuple(vjp)

~\anaconda3\envs\myenv\lib\site-packages\pennylane\interfaces\torch.py in forward(ctx_, parent_ctx, *input_)
    124                 ctx_.dy = parent_ctx.dy
    125                 ctx_.save_for_backward(*input_)
--> 126                 jacobian = _evaluate_grad_matrix("jacobian")
    127                 return jacobian
    128 

~\anaconda3\envs\myenv\lib\site-packages\pennylane\interfaces\torch.py in _evaluate_grad_matrix(grad_matrix_fn)
    109             print(ctx.args)
    110             grad_matrix = getattr(tape, grad_matrix_fn)(
--> 111                 device, params=ctx.args, **tape.jacobian_options
    112             )
    113             tape.set_parameters(ctx.all_params, trainable_only=False)

~\anaconda3\envs\myenv\lib\site-packages\pennylane\tape\qubit_param_shift.py in jacobian(self, device, params, **options)
    122         self._append_evA_tape = True
    123         self._evA_result = None
--> 124         return super().jacobian(device, params, **options)
    125 
    126     def parameter_shift(self, idx, params, **options):

~\anaconda3\envs\myenv\lib\site-packages\pennylane\tape\jacobian_tape.py in jacobian(self, device, params, **options)
    514 
    515         # perform gradient method validation
--> 516         diff_methods = self._grad_method_validation(method)
    517 
    518         if not self._has_trainable_params(params, diff_methods):

~\anaconda3\envs\myenv\lib\site-packages\pennylane\tape\jacobian_tape.py in _grad_method_validation(self, method)
    198 
    199         if nondiff_params:
--> 200             raise ValueError(f"Cannot differentiate with respect to parameter(s) {nondiff_params}")
    201 
    202         numeric_params = {idx for idx, g in diff_methods.items() if g == "F"}

ValueError: Cannot differentiate with respect to parameter(s) {0}

I use a qml.qnn.TorchLayer with the parameters inputs and weights, exactly as described in
https://pennylane.readthedocs.io/en/stable/code/api/pennylane.qnn.TorchLayer.html
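
For reference, the documented pattern looks roughly like this (a minimal sketch; the embedding and ansatz here are just placeholders, not my actual circuit):

import pennylane as qml
import torch

n_qubits = 2
dev = qml.device("default.qubit", wires=n_qubits)

@qml.qnode(dev, interface="torch")
def qnode(inputs, weights):
    # "inputs" holds the data; every other argument is treated as trainable weights
    qml.templates.AngleEmbedding(inputs, wires=range(n_qubits))
    qml.templates.BasicEntanglerLayers(weights, wires=range(n_qubits))
    return [qml.expval(qml.PauliZ(wires=i)) for i in range(n_qubits)]

weight_shapes = {"weights": (6, n_qubits)}
qlayer = qml.qnn.TorchLayer(qnode, weight_shapes)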

Here is my code:

def rotation_layer(params, qubits):
    # parametrized ry, rz rotations
    n = len(qubits)
    for i,q in enumerate(qubits):
        qml.RY(params[i], wires=q)
    for i,q in enumerate(qubits):
        qml.RY(params[i+n], wires=q)

def entanglement_layer(qubits):
    n = len(qubits)
    for i in range(0, n):
        qml.CNOT(wires=[qubits[i], qubits[(i+1)%n]])

@qml.qnode(qml.device('default.qubit', wires=4))
def QNode4(inputs, weights):
    inputs = F.normalize(inputs,dim=-1,p=2)  #L2-normalization
    N = len(inputs)
    n = int(m.log(N,2))
    # data encoding
    qml.QubitStateVector(inputs, wires=range(n))
    
    #variational circuit
    measureWires = range(n)
    for w in weights:
        rotation_layer(w, measureWires)
        entanglement_layer(measureWires)

    #measure
    return qml.probs(wires = measureWires)

# model class
class QNet(nn.Module):

    def __init__(self):
        super(QNet, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 16, 128)  # 5*5 from image dimension
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 16)
        n_qubits = 4
        n_layers = 2
        self.fc4 = qml.qnn.TorchLayer(QNode4, {"weights": (n_layers, n_qubits**2)})
        self.fc5 = nn.Linear(16, 10)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square, you can specify with a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        # sqrt because output are amplitudes. This way input=output if there is only data encoding
        x = torch.sqrt(self.fc4(x)).to(device)
        x = self.fc5(x)
        return x
    

#training
hybrid_model = QNet().to(device)
epochs = 16
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(hybrid_model.parameters(), lr=0.01, momentum=0.8)

qHist = train(device, hybrid_model, optimizer, criterion, 1, dataloader_train, dataloader_test)

plotTrainingResults([cHist, qHist], ["classical", "quantum"])

I can’t find a way to make PennyLane stop trying to differentiate the inputs, and as far as I understand the qml.qnn.TorchLayer class, this shouldn’t be happening in the first place.

Hi @Daniel63656!

I’m trying to run your code now, but I’m running into an issue where device and train are not defined.

Would you be able to share a minimal non-working example that is executable as-is? That will help me better debug the issue :slight_smile:

Thanks for your quick reply.
Here is a runnable example (sorry that I have to post it like this; as a new user I cannot upload files :frowning:)

import copy
import math as m
import numpy as np
import pennylane as qml
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())

## Classical Network

We first build and train a classical network that is built in a way that a linear layer (16x16) can be swapped for a QNode later on

class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 16, 128)  # 5*5 from image dimension
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 16)
        self.fc4 = nn.Linear(16, 16)
        self.fc5 = nn.Linear(16, 10)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square, you can specify with a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = self.fc5(x)
        return x
    
    # backward function (where gradients are computed) is automatically defined by pyTorch
    
model = Net()
print(model)
print("")
print("parameters:")
params = list(model.parameters())
for i in range(len(params)):
    print(params[i].size())

# training method
import time
import torch
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 300

# Training function returning training history
def train(device, model, optimizer, criterion, epochs, training, testing=None):
    print('starting...')
    since = time.time()
    history = History()
    
    for epoch in range(epochs):
        correct = 0
        number_batches = len(training)

        if (testing != None):
            valAcc, valCost = evaluate(device, model, criterion, testing)
            history.testAccuracy.append(valAcc)
            history.testCost.append(valCost)
        
        
        for i, data in enumerate(training):  #loop through batches
            inputs, labels = data
            batch_size = len(labels)
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            correct += (predicted == labels).float().sum()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            batch_percentage = i*20//number_batches
            print("epoch %d/%d [%-20s]" % (epoch+1,epochs,'='*batch_percentage), end ='\r')
            
        # save statistics
        accuracy = correct / (number_batches*batch_size)
        history.trainingAccuracy.append(accuracy)
        history.trainingCost.append(loss.item())
        print("epoch {}/{} completed              Accuracy: {:.2f}".format(epoch+1, epochs, accuracy), end ='\r')

    # get final trained results
    finalAcc, finalCost = evaluate(device, model, criterion, training)
    history.trainingAccuracy.append(finalAcc)
    history.trainingCost.append(finalCost)
    
    if (testing != None):
         valAcc, valCost = evaluate(device, model, criterion, testing)
         history.testAccuracy.append(valAcc)
         history.testCost.append(valCost)
    
    print("epoch {}/{} completed              Accuracy: {:.2f}".format(epochs, epochs, finalAcc))
    print("Training completed in {:.2f}s".format(time.time() - since))
    return history

def evaluate(device, model, criterion, dataloader):
    correct = 0
    for i, data in enumerate(dataloader):
        inputs, labels = data
        batch_size = len(labels)
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        correct += (predicted == labels).float().sum()
        loss = criterion(outputs, labels)
    acc = correct / (len(dataloader)*batch_size)
    return acc, loss.item()
    
    
# wrapper class for training history
class History:
    def __init__(self):
        self.trainingAccuracy = list()
        self.trainingCost = list()
        self.testAccuracy = list()
        self.testCost = list()
        
def saveHistory(PATH, name, history):
    save = np.asarray([history.trainingAccuracy, history.trainingCost, history.testAccuracy, history.testCost])
    np.save(PATH + name + ".npy", save)
    
def loadHistory(PATH, name):
    load = np.load(PATH + name + ".npy", allow_pickle=True)
    history = History()
    history.trainingAccuracy = load[0]
    history.trainingCost     = load[1]
    history.testAccuracy     = load[2]
    history.testCost         = load[3]
    return history
    
def plotTrainingResults(histories, labels):
    fig, axs = plt.subplots(2,2, figsize=(12, 5))
    plt.subplots_adjust(hspace = 0.3)
    
    axs[0,0].grid()
    axs[0,1].grid()
    axs[1,0].grid()
    axs[1,1].grid()
    axs[0,0].set_ylim([0, 1.1])
    axs[0,1].set_ylim([0, 1.1])
    axs[0,0].set_title('Training Accuracy')
    axs[0,1].set_title('Test Accuracy')
    axs[1,0].set_title('Training Cost')
    axs[1,1].set_title('Test Cost')
    lines= []
    for hist in histories:
        lines.append(axs[0,0].plot(hist.trainingAccuracy))
        axs[0,1].plot(hist.testAccuracy)
        axs[1,0].plot(hist.trainingCost)
        axs[1,1].plot(hist.testCost)
    
    for ax in axs.flat:
        ax.label_outer()
    
    fig.legend(labels, loc='upper left', prop={'size': 11})
    plt.show()

## MNIST Dataset

batch_size = 8
# use None on these to load whole dataset
training_size_limit = 1000
test_size_limit = 400


trf = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize((0.5), (0.5))])

training_data = datasets.MNIST(root='./MNIST', train=True,  download=True, transform=trf)
test_data     = datasets.MNIST(root='./MNIST', train=False, download=True, transform=trf)

if (len(training_data) == 60000 and len(test_data) == 10000):
    print("data loading successful!")

if (training_size_limit != None):
    training_data = torch.utils.data.Subset(training_data, range(training_size_limit))
if (test_size_limit != None):
    test_data     = torch.utils.data.Subset(test_data, range(test_size_limit))

    
# Initialize dataloaders
dataloader_train = torch.utils.data.DataLoader(training_data, batch_size=batch_size, shuffle=True)
dataloader_test  = torch.utils.data.DataLoader(test_data,     batch_size=batch_size, shuffle=True)

## Training

model = Net().to(device)
epochs = 16
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.8)

cHist = train(device, model, optimizer, criterion, epochs, dataloader_train, dataloader_test)

plotTrainingResults([cHist], ["classical"])

## Transfer Learning

class Pretrained(torch.nn.Module):
    def __init__(self, pretrained_model, qLayer):
        super().__init__()
        self.__dict__ = copy.deepcopy(pretrained_model.__dict__)
        # freeze old parameters of model
        params = list(self.parameters())
        for i in range(len(params)-4):
            params[i].requires_grad = False
        # swap layer
        self.fc4 = qLayer
        
    def forward(self,x):
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square, you can specify with a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        # sqrt because output are amplitudes. This way input=output if there is only data encoding
        x = torch.sqrt(self.fc4(x)).to(device)
        x = self.fc5(x)
        return x

#----------Ansatz 4----------#
@qml.qnode(qml.device('default.qubit', wires=4), interface="torch")
def QNode4(inputs, weights):
    inputs = F.normalize(inputs,dim=-1,p=2)  #L2-normalization
    N = len(inputs)
    n = int(m.log(N,2))
    # data encoding
    qml.QubitStateVector(inputs, wires=range(n))
    
    #variational circuit
    measureWires = range(n)
    for w in weights:
        rotation_layer(w, measureWires)
        entanglement_layer(measureWires)

    #measure
    return qml.probs(wires = measureWires)


#create new model
n_qubits = 4
n_layers = 2
qLayer = qml.qnn.TorchLayer(QNode4, {"weights": (n_layers, n_qubits*2)})
pretrained = Pretrained(model, qLayer).to(device)

#train
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(pretrained.parameters(), lr=0.01, momentum=0.8)
a4Hist = train(device, pretrained, optimizer, criterion, 5, dataloader_train)

## Train the whole model

def rotation_layer(params, qubits):
    # parametrized ry, rz rotations
    n = len(qubits)
    for i,q in enumerate(qubits):
        qml.RY(params[i], wires=q)
    for i,q in enumerate(qubits):
        qml.RY(params[i+n], wires=q)

def entanglement_layer(qubits):
    n = len(qubits)
    for i in range(0, n):
        qml.CNOT(wires=[qubits[i], qubits[(i+1)%n]])

@qml.qnode(qml.device('default.qubit', wires=4))
def QNode4(inputs, weights):
    inputs = F.normalize(inputs,dim=-1,p=2)  #L2-normalization
    N = len(inputs)
    n = int(m.log(N,2))
    # data encoding
    qml.QubitStateVector(inputs, wires=range(n))
    
    #variational circuit
    measureWires = range(n)
    for w in weights:
        rotation_layer(w, measureWires)
        entanglement_layer(measureWires)

    #measure
    return qml.probs(wires = measureWires)

# model class
class QNet(nn.Module):

    def __init__(self):
        super(QNet, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 16, 128)  # 5*5 from image dimension
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 16)
        n_qubits = 4
        n_layers = 2
        self.fc4 = qml.qnn.TorchLayer(QNode4, {"weights": (n_layers, n_qubits**2)})
        self.fc5 = nn.Linear(16, 10)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square, you can specify with a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        # sqrt because output are amplitudes. This way input=output if there is only data encoding
        x = torch.sqrt(self.fc4(x)).to(device)
        x = self.fc5(x)
        return x
    

#training
hybrid_model = QNet().to(device)
epochs = 16
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(hybrid_model.parameters(), lr=0.01, momentum=0.8)

qHist = train(device, hybrid_model, optimizer, criterion, 1, dataloader_train, dataloader_test)

plotTrainingResults([cHist, qHist], ["classical", "quantum"])


No worries @Daniel63656! Thanks for posting the full code.

I think I’ve zeroed in on the issue; in QNode4, the input tensor is being encoded in the circuit using qml.QubitStateVector, which does not support differentiation via the parameter-shift rule, unfortunately:

import pennylane as qml
import torch

dev = qml.device("default.qubit", wires=2)

@qml.qnode(dev, interface="torch")
def circuit(inputs, weights):
    qml.QubitStateVector(inputs, wires=[0, 1])
    qml.templates.StronglyEntanglingLayers(weights, wires=[0, 1])
    return qml.probs(wires=[0, 1])

inputs = torch.tensor([0.1, 0.2, 0.3, 0.4], requires_grad=True)
inputs = inputs / torch.linalg.norm(inputs)
weights = torch.ones([3, 2, 3], requires_grad=True)

>>> res = torch.sum(torch.sin(circuit(inputs, weights)))
>>> res.backward()
  File "/home/josh/xanadu/pennylane/pennylane/tape/jacobian_tape.py", line 205, in _grad_method_validation
    raise ValueError(f"Cannot differentiate with respect to parameter(s) {nondiff_params}")
ValueError: Cannot differentiate with respect to parameter(s) {0}

One ‘workaround’ is to instead use a state preparation ansatz or template. That is, a state preparation that uses a set of unitary gates to prepare the state from the ground state. Because it decomposes down into gates that do support the parameter-shift rule, the overall state preparation is now differentiable.

One example in PennyLane is the qml.MottonenStatePreparation template:

dev = qml.device("default.qubit", wires=2)

@qml.qnode(dev, interface="torch")
def circuit(inputs, weights):
    qml.templates.MottonenStatePreparation(inputs, wires=[0, 1])
    qml.templates.StronglyEntanglingLayers(weights, wires=[0, 1])
    return qml.probs(wires=[0, 1])

inputs = torch.tensor([0.1, 0.2, 0.3, 0.4], requires_grad=True)
inputs = inputs / torch.linalg.norm(inputs)
weights = torch.ones([3, 2, 3], requires_grad=True)

However, the Mottonen state preparation template is not yet fully tested for differentiability, which we are working on. So the above code will still fail, but with a different error :frowning:

However, it is just an issue with the dtype; adding the following to pennylane/templates/state_preparations/mottonen.py (around line 206),

index 860eeb80..a7a83544 100644
--- a/pennylane/templates/state_preparations/mottonen.py
+++ b/pennylane/templates/state_preparations/mottonen.py
@@ -203,6 +203,9 @@ def _get_alpha_y(a, n, k):
     with np.errstate(divide="ignore", invalid="ignore"):
         division = numerator / denominator

+    division = qml.math.cast(division, np.float64)
+    denominator = qml.math.cast(denominator, np.float64)
+
     division = qml.math.where(denominator != 0.0, division, 0.0)

     return 2 * qml.math.arcsin(qml.math.sqrt(division))

fixes the bug :slight_smile:

>>> res = torch.sum(torch.sin(circuit(inputs, weights)))
>>> res.backward()
>>> print(weights.grad)
tensor([[[ 3.8434e-03, -2.5692e-03, -2.9169e-03],
         [ 2.2742e-03, -6.6792e-03, -5.7919e-03]],

        [[-5.7919e-03, -1.3341e-02, -1.2886e-02],
         [ 2.1471e-02,  1.0306e-02,  1.6321e-02]],

        [[ 1.6321e-02,  1.3402e-02,  3.9293e-17],
         [-8.7039e-03,  1.7823e-02, -3.4684e-18]]])

I will make a PR to solve this bug; in the meantime, feel free to apply the fix I posted above directly to the mottonen.py file.

Thanks Josh,

I have my own implementation of a data amplitude encoding circuit and I noticed that using it doesn’t lead to the error (forgot to mention that, sorry). However, mine is way slower.

But I am not sure if I understand the problem correctly. The inputs parameter should be a non-trainable parameter and therefore not be differentiated at all.
Are you saying that the qml.QubitStateVector method prevents parameter-shift differentiation of the trainable parameters, simply because it is part of the circuit?

Also, are you familiar with Qiskit’s amplitude embedding implementation? The circuit can be built out of U3 and CNOT gates only and should therefore be differentiable. I assumed PennyLane uses a similar implementation.

https://qiskit.org/documentation/stubs/qiskit.extensions.Initialize.html

Also, are you familiar with qiskits amplitude embedding implementation?

I am familiar with the function, but not the decomposition they are using. However I assume it must be similar to ours, yes. The one we use is described here: https://arxiv.org/pdf/quant-ph/0407010.pdf

But I am not sure if I understand the problem correctly. The inputs parameter should be a non-trainable parameter and therefore not differentiated at all.

It might be good to verify this in the code if possible. Perhaps something in your model is causing the QNode inputs to become trainable? This can happen if the inputs to the QNode are the output of a function that is itself differentiable :thinking:
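
For example, a quick way to see this happening in Torch (a minimal sketch, not tied to your model):

import torch

w = torch.ones(4, requires_grad=True)    # e.g. weights of a preceding classical layer
x = torch.tensor([0.1, 0.2, 0.3, 0.4])   # raw data, requires_grad=False

y = w * x                                # output of a differentiable operation
print(y.requires_grad)                   # True -- passed into a QNode, y is treated as trainable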

If the input is not differentiable, then it should have the setting requires_grad=False. PennyLane ignores Torch tensors that have requires_grad=False, so the following works correctly:

import pennylane as qml
import torch

dev = qml.device("default.qubit", wires=2)

@qml.qnode(dev, interface="torch")
def circuit(inputs, weights):
    qml.QubitStateVector(inputs, wires=[0, 1])
    qml.templates.StronglyEntanglingLayers(weights, wires=[0, 1])
    return qml.probs(wires=[0, 1])

# set the inputs as non-differentiable
inputs = torch.tensor([0.1, 0.2, 0.3, 0.4], requires_grad=False)
inputs = inputs / torch.linalg.norm(inputs)

weights = torch.ones([3, 2, 3], requires_grad=True)

res = torch.sum(torch.sin(circuit(inputs, weights)))
res.backward()
print(weights.grad)

I know this example but because I am using the qml.qnn.TorchLayer in a network class, I can’t implement it like this (at least I don’t know how).

But from the qml.qnn.TorchLayer docs:

The signature of the QNode must contain an inputs named argument for input data, with all other arguments to be treated as internal weights.

So I assumed this was working properly. Also, like I said, I inserted the line

print(ctx.args)

in line 109 of pennylane\interfaces\torch.py

to see what the interface tries to differentiate. Using QubitStateVector results in

[array([0. , 0.11698876, 0.12460404, 0. , 0.29290894,
0.5918856 , 0.467779 , 0. , 0. , 0. ,
0. , 0.5444552 , 0.13933586, 0. , 0. ,
0. ], dtype=float32), 3.524852752685547, 0.36004096269607544, 1.1748652458190918, 4.160643577575684, 0.32508784532546997, 1.1772842407226562, 4.3311238288879395, 0.13922111690044403, 5.3807806968688965, 4.490009307861328, 3.0298593044281006, 5.180414199829102, 1.7754203081130981, 4.418515205383301, 4.486685276031494, 1.1825096607208252]

where one can clearly see the inputs (the array) in the list of parameters to be differentiated. When I use my own encoding, this array is missing and only the weights of the QNode appear in this list (then the network trains).

In another forum post someone suggested making inputs a keyword argument. That didn’t help.
I’m really out of things to try by now. There seems to be no documented case of how to use a QNode in an existing model with certainty that PennyLane won’t try to differentiate the inputs.

By the way, I used this call to decompose qiskit.initialize into U3 gates. Maybe it helps.

from qiskit import QuantumCircuit, transpile

# make some input vector x of length 2**n (normalized)
qc = QuantumCircuit(n)
# this function takes care of amplitude encoding in a highly efficient way
qc.initialize(x, [k for k in range(n)])
qc.measure_all()

# present the circuit with U-gates
qc = transpile(qc, basis_gates=['u1', 'u2', 'u3', 'cx'])
qc.draw(output='mpl')

Hi @Daniel63656!

I’m joining the discussion a bit late so was wondering if we could rewind a bit.

But I am not sure if I understand the problem correctly. The inputs parameter should be a non-trainable parameter and therefore not differentiated at all.

Why do you expect that to be the case? You have a quantum network defined as:

def forward(self, x):
    # Max pooling over a (2, 2) window
    x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
    # If the size is a square, you can specify with a single number
    x = F.max_pool2d(F.relu(self.conv2(x)), 2)
    x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    x = F.relu(self.fc3(x))
    # sqrt because output are amplitudes. This way input=output if there is only data encoding
    x = torch.sqrt(self.fc4(x)).to(device)
    x = self.fc5(x)
    return x

Given the line x = torch.sqrt(self.fc4(x)).to(device) where self.fc4 is a TorchLayer, don’t we expect to differentiate the TorchLayer with respect to its inputs so that we can access the derivative of QNet with respect to weights of, e.g., fc1 or fc2? In that case, we need the amplitude embedding of inputs to be differentiable, so working out an alternative to QubitStateVector like MottonenStatePreparation makes sense.

Hi Tom,

This would explain why I can train without problems when I freeze all previous layers. Otherwise autograd calculates ∂Cost/∂inputs? I thought that in backpropagation this term does not require an actual differentiation (rather something like inputs - labels for cross entropy, for example).

When I use my own data embedding, this did not seem to be a problem.
I don’t use the inputs directly in the circuit; I rather construct angles out of them in the function findAngles(x). In QNode1 I use the embedding layer also for the variational circuit (so the weights are also used to calculate angles). When I insert a

print(model.fc4.weights.grad)

after optimizer.step() in the training loop, all the gradients are None. This seemed logical to me, because the weights don’t end up directly in the circuit, and I can see why autograd can’t follow along.
In QNode2 I just removed the weights-to-angles step and put the weights straight into the variational layer. I get gradients now (but mostly zero). But now, knowing that the inputs also need to be differentiable, the above problem extends to them as well. So using QNode2 will also hamper training, because autograd still can’t compute ∂Cost/∂inputs? So I guess in this case the gradient is just silently None instead of throwing an error?

Here the code

def rotation_layer(params, qubits):
    # parametrized ry, rz rotations
    n = len(qubits)
    for i,q in enumerate(qubits):
        qml.RY(params[i], wires=q)
    for i,q in enumerate(qubits):
        qml.RY(params[i+n], wires=q)

def entanglement_layer(qubits):
    n = len(qubits)
    for i in range(0, n):
        qml.CNOT(wires=[qubits[i], qubits[(i+1)%n]])
        
# getting rotation angles from params
def findAngles(x):       # cost: O(n)
    if (len(x) > 1):
        # auxilary vector v with 2^(n-1) = N/2 dimensions
        v = [m.sqrt(abs(x[2*k])**2 + abs(x[2*k+1])**2) for k in range(len(x)//2)]
        inner_angles = findAngles(v)
        # output vector angles with N/2 dimesnions
        angles = []
        for k in range(len(v)):
            if (v[k] != 0):
                if (x[2*k] > 0):
                    angles.append(2*m.asin(x[2*k+1]/v[k]))
                else:
                    angles.append(2*m.pi-2*m.asin(x[2*k+1]/v[k]))
            else:
                angles.append(0)
        if (inner_angles != None):   #this appends the lists if inner_angles isn't empty
            angles = inner_angles + angles
        return angles
    
def dataEmbedding(angles, N):
    n = int(m.log(N,2))
    
    qml.RY(angles[0], wires=0)   #apply first Ry which isn't controlled
    rep = 2
    idx = 1
    for num_con in range(1,n):
        for i in range(rep):
            # calculate control pattern 
            conditionState = idx-(2**num_con-1)
            binary_index = '{:0{}b}'.format(conditionState, num_con)
            # calculate matrix of RY-Gate
            phi = angles[idx]/2
            U = np.array([[m.cos(phi),  m.sin(phi)], [-m.sin(phi),  m.cos(phi)]])
            # apply. Pattern in binary form can be given as parameter directly
            qml.ControlledQubitUnitary(U, control_wires=range(num_con), 
                                       wires=num_con, control_values=binary_index)
            idx += 1
        rep*=2

# create the QNode
@qml.qnode(qml.device('default.qubit', wires=4), interface="torch")
def QNode1(inputs, weights):
    inputs = F.normalize(inputs,dim=-1,p=2)  #L2-normalization
    N = len(inputs)
    n = int(m.log(N,2))
    # data encoding
    angles = findAngles(inputs)
    dataEmbedding(angles, N)
    
    #variational circuit
    measureWires = range(n)
    for w in weights:
        angles = findAngles(w)
        dataEmbedding(angles, N)

    #measure
    return qml.probs(wires = measureWires)

# create the QNode
@qml.qnode(qml.device('default.qubit', wires=4), interface="torch")
def QNode2(inputs, weights):
    inputs = F.normalize(inputs,dim=-1,p=2)  #L2-normalization
    N = len(inputs)
    n = int(m.log(N,2))
    # data encoding
    dataEmbedding(inputs, N)
    
    #variational circuit
    measureWires = range(n)
    for w in weights:
        dataEmbedding(w, N)

    #measure
    return qml.probs(wires = measureWires)

# model class
class QNet(nn.Module):

    def __init__(self):
        super(QNet, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 16, 128)  # 5*5 from image dimension
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 16)
        n_qubits = 4
        n_layers = 2
        self.fc4 = qml.qnn.TorchLayer(QNode2, {"weights": (n_layers, n_qubits**2)})
        self.fc5 = nn.Linear(16, 10)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square, you can specify with a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        # sqrt because output are amplitudes. This way input=output if there is only data encoding
        x = torch.sqrt(self.fc4(x)).to(device)
        x = self.fc5(x)
        return x
    

#training
hybrid_model = QNet().to(device)
epochs = 16
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(hybrid_model.parameters(), lr=0.01, momentum=0.8)

qHist = train(device, hybrid_model, optimizer, criterion, 1, dataloader_train, dataloader_test)

plotTrainingResults([cHist, qHist], ["classical", "quantum"])

Hi @Daniel63656,

Yes, in general you should aim for your QNode to be differentiable with respect to inputs. You can do this by making sure that every operation you apply to inputs (and the trainable parameters) is differentiable. In QNode1, it looks like findAngles is not differentiable because of use of m.sqrt (assuming m is Python’s math module) - you should swap out these cases with torch-compatible functionality (see here for example). It looks like the dataEmbedding may also be breaking differentiability with use of the math module, so this could be giving you None or zero gradients.

I’d suggest breaking things up a little and checking that you can find the gradient with respect to findAngles and dataEmbedding before constructing the full model.

I still don’t get why we would need to differentiate the inputs.

I thought in backpropagation this does not result in the actual differentiation (rather inputs-labels for cross entropy for example)

Is this a special thing of autograd?

I’d suggest breaking things up a little and checking that you can find the gradient with respect to findAngles and dataEmbedding before constructing the full model.

How would I do that? So far I was only able to print weights.grad. model.fc4.inputs is “not defined”

Hey @Daniel63656,

I thought in backpropagation this does not result in the actual differentiation (rather inputs-labels for cross entropy for example)

I’m not quite sure what you mean here, do you have a reference for this intuition?

Differentiating the inputs of a layer in a sequential model is a common feature and not particular to quantum or PennyLane. Check out this for a bit more detail.

How would I do that? So far I was only able to print weights.grad. model.fc4.inputs is “not defined”

The following example shows how findAngles can be set up to be compatible with torch differentiation:

import math as m
import torch

def findAngles(x):       # cost: O(n)
    if (len(x) > 1):
        # auxilary vector v with 2^(n-1) = N/2 dimensions
        v = [torch.sqrt(abs(x[2*k])**2 + torch.abs(x[2*k+1])**2) for k in range(len(x)//2)]
        inner_angles = findAngles(v)
        # output vector angles with N/2 dimesnions
        angles = []
        for k in range(len(v)):
            if (v[k] != 0):
                if (x[2*k] > 0):
                    angles.append(2*torch.asin(x[2*k+1]/v[k]))
                else:
                    angles.append(2*m.pi-2*torch.asin(x[2*k+1]/v[k]))
            else:
                angles.append(0)
        if (inner_angles != None):   #this appends the lists if inner_angles isn't empty
            angles = inner_angles + angles
        return angles
    
x = torch.ones(8, requires_grad=True)
out = torch.stack(findAngles(x))

loss = torch.sum(out)
loss.backward()

x.grad

If we had used, for example, m.sqrt(...) rather than torch.sqrt(...) in the list comprehension, the above would raise an error.

@Daniel63656, does the following example help motivate why we differentiate with respect to the inputs of a layer:

Consider a cost function C(x, w1, w2) = f2(w2, f1(w1, x)), where f2(w2, x) and f1(w1, x) are layers, w1 and w2 are trainable weights, and x is the model input.

We want to find dC / dw2 and dC / dw1. Finding the former is easy, while the latter can be done with the chain rule:

dC / dw1 = (d f2 / d o1) * (d o1 / d w1), where o1 is the output of the first layer o1 = f1(w1, x). We hence need to evaluate d f2 / d o1, which is the derivative of the output of layer 2 with respect to its input.
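
To see this concretely with autograd, here is a minimal classical sketch (the tensors x, w1, w2 just mirror the notation above):

import torch

torch.manual_seed(0)
x = torch.randn(4)                          # model input (not trainable)
w1 = torch.randn(4, 4, requires_grad=True)
w2 = torch.randn(1, 4, requires_grad=True)

o1 = torch.tanh(w1 @ x)                     # o1 = f1(w1, x)
C = (w2 @ o1).sum()                         # C = f2(w2, o1)

# dC/dw1 only exists because autograd propagates dC/do1, i.e. the derivative
# of the second layer with respect to its input
C.backward()
print(w1.grad)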

When I use the modified findAngles(x) method to train the whole network with QNode3, I get the following error:

ValueError                                Traceback (most recent call last)
<ipython-input-30-8e86a1aed95a> in <module>
    109 optimizer = optim.SGD(hybrid_model.parameters(), lr=0.01, momentum=0.8)
    110 
--> 111 qHist = train(device, hybrid_model, optimizer, criterion, epochs, dataloader_train, dataloader_test)
    112 
    113 plotTrainingResults([cHist, qHist], ["classical", "quantum"])

~\Hahn_schickard\jupyter\pyTorch_utils.py in train(device, model, optimizer, criterion, epochs, training, testing)
     31             optimizer.zero_grad()
     32             # forward + backward + optimize
---> 33             outputs = model(inputs)
     34             _, predicted = torch.max(outputs.data, 1)
     35             correct += (predicted == labels).float().sum()

~\anaconda3\envs\myenv\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
    887             result = self._slow_forward(*input, **kwargs)
    888         else:
--> 889             result = self.forward(*input, **kwargs)
    890         for hook in itertools.chain(
    891                 _global_forward_hooks.values(),

<ipython-input-30-8e86a1aed95a> in forward(self, x)
     98         x = F.relu(self.fc3(x))
     99         # sqrt because output are amplitudes. This way input=output if there is only data encoding
--> 100         x = torch.sqrt(self.fc4(x)).to(device)
    101         x = self.fc5(x)
    102         return x

~\anaconda3\envs\myenv\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
    887             result = self._slow_forward(*input, **kwargs)
    888         else:
--> 889             result = self.forward(*input, **kwargs)
    890         for hook in itertools.chain(
    891                 _global_forward_hooks.values(),

~\anaconda3\envs\myenv\lib\site-packages\pennylane\qnn\torch.py in forward(self, inputs)
    272             reconstructor = []
    273             for x in torch.unbind(inputs):
--> 274                 reconstructor.append(self.forward(x))
    275             return torch.stack(reconstructor)
    276 

~\anaconda3\envs\myenv\lib\site-packages\pennylane\qnn\torch.py in forward(self, inputs)
    276 
    277         # If the input is 1-dimensional, calculate the forward pass as usual
--> 278         return self._evaluate_qnode(inputs)
    279 
    280     def _evaluate_qnode(self, x):

~\anaconda3\envs\myenv\lib\site-packages\pennylane\qnn\torch.py in _evaluate_qnode(self, x)
    291             **{arg: weight.to(x) for arg, weight in self.qnode_weights.items()},
    292         }
--> 293         return self.qnode(**kwargs).type(x.dtype)
    294 
    295     def __str__(self):

~\anaconda3\envs\myenv\lib\site-packages\pennylane\qnode.py in __call__(self, *args, **kwargs)
    553 
    554         # execute the tape
--> 555         res = self.qtape.execute(device=self.device)
    556 
    557         if original_shots is not None:

~\anaconda3\envs\myenv\lib\site-packages\pennylane\tape\tape.py in execute(self, device, params)
   1262             params = self.get_parameters()
   1263 
-> 1264         return self._execute(params, device=device)
   1265 
   1266     def execute_device(self, params, device):

~\anaconda3\envs\myenv\lib\site-packages\pennylane\interfaces\torch.py in _execute(self, params, **kwargs)
    256     def _execute(self, params, **kwargs):
    257         kwargs["tape"] = self
--> 258         res = _TorchInterface.apply(kwargs, *params)
    259         return res
    260 

~\anaconda3\envs\myenv\lib\site-packages\pennylane\interfaces\torch.py in forward(ctx, input_kwargs, *input_)
     71         # evaluate the tape
     72         tape.set_parameters(ctx.all_params_unwrapped, trainable_only=False)
---> 73         res = tape.execute_device(ctx.args, device)
     74         tape.set_parameters(ctx.all_params, trainable_only=False)
     75 

~\anaconda3\envs\myenv\lib\site-packages\pennylane\tape\tape.py in execute_device(self, params, device)
   1293 
   1294         if isinstance(device, qml.QubitDevice):
-> 1295             res = device.execute(self)
   1296         else:
   1297             res = device.execute(self.operations, self.observables, {})

~\anaconda3\envs\myenv\lib\site-packages\pennylane\_qubit_device.py in execute(self, circuit, **kwargs)
    182 
    183         # apply all circuit operations
--> 184         self.apply(circuit.operations, rotations=circuit.diagonalizing_gates, **kwargs)
    185 
    186         # generate computational basis samples

~\anaconda3\envs\myenv\lib\site-packages\pennylane\devices\default_qubit.py in apply(self, operations, rotations, **kwargs)
    190                 self._apply_basis_state(operation.parameters[0], operation.wires)
    191             else:
--> 192                 self._state = self._apply_operation(self._state, operation)
    193 
    194         # store the pre-rotated state

~\anaconda3\envs\myenv\lib\site-packages\pennylane\devices\default_qubit.py in _apply_operation(self, state, operation)
    215             return self._apply_ops[operation.base_name](state, axes, inverse=operation.inverse)
    216 
--> 217         matrix = self._get_unitary_matrix(operation)
    218 
    219         if isinstance(operation, DiagonalOperation):

~\anaconda3\envs\myenv\lib\site-packages\pennylane\devices\default_qubit.py in _get_unitary_matrix(self, unitary)
    405             return unitary.eigvals
    406 
--> 407         return unitary.matrix
    408 
    409     @classmethod

~\anaconda3\envs\myenv\lib\site-packages\pennylane\operation.py in matrix(self)
    660     @property
    661     def matrix(self):
--> 662         op_matrix = self._matrix(*self.parameters)
    663 
    664         if self.inverse:

~\anaconda3\envs\myenv\lib\site-packages\pennylane\ops\qubit.py in _matrix(cls, *params)
   1879 
   1880         if not np.allclose(U @ U.conj().T, np.identity(U.shape[0])):
-> 1881             raise ValueError("Operator must be unitary.")
   1882 
   1883         return U

ValueError: Operator must be unitary.

With the old findAngles(x) method this doesn’t happen. But the math stayed exactly the same?!


I understand that the chain rule requires you to calculate d f2 / d o1. I just thought that term could be replaced by a term specified by the cost function, simply containing the inputs.

For example, given the quadratic cost C = (y - o)**2 / 2 and a layer with weights w, biases b, input a, z = w*a + b, output o, labels y and sigma as activation function, then

d C/d w = (o - y) * sigma'(z) * a
d C/d b = (o - y) * sigma'(z)

So in order to do backpropagation, you can substitute the terms containing input derivatives with something cost-function-specific, without actually differentiating with respect to inputs.

I guess autograd doesn’t do that, in order to be able to differentiate any network regardless of the cost function, layer types, … (to be automatic differentiation)?

@Daniel63656, your error looks like an issue with qml.QubitUnitary or qml.ControlledQubitUnitary. Perhaps it is your use of qml.ControlledQubitUnitary in the following function:

def dataEmbedding(angles, N):
    n = int(m.log(N,2))
    
    qml.RY(angles[0], wires=0)   #apply first Ry which isn't controlled
    rep = 2
    idx = 1
    for num_con in range(1,n):
        for i in range(rep):
            # calculate control pattern 
            conditionState = idx-(2**num_con-1)
            binary_index = '{:0{}b}'.format(conditionState, num_con)
            # calculate matrix of RY-Gate
            phi = angles[idx]/2
            U = np.array([[m.cos(phi),  m.sin(phi)], [-m.sin(phi),  m.cos(phi)]])
            # apply. Pattern in binary form can be given as parameter directly
            qml.ControlledQubitUnitary(U, control_wires=range(num_con), 
                                       wires=num_con, control_values=binary_index)
            idx += 1
        rep*=2

Maybe worth checking what U looks like.

Note that the above function should also be torch-compatible. I don’t think using qml.ControlledQubitUnitary is compatible with differentiation when using the torch interface, so it may be good to think of alternatives to using this gate, which may for example include MultiControlledX and a single qubit rotation, or PauliRot.
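
For instance, a multi-controlled RY built only from single-qubit RY rotations and MultiControlledX could look roughly like this (a sketch of the standard controlled-rotation decomposition; controlled_ry is just an illustrative helper, with control_values a bit string as in ControlledQubitUnitary):

import pennylane as qml

def controlled_ry(theta, control_wires, target, control_values):
    # RY(theta/2), MCX, RY(-theta/2), MCX applies RY(theta) on `target` when the
    # controls match `control_values`, and the identity otherwise
    qml.RY(theta / 2, wires=target)
    qml.MultiControlledX(control_wires=control_wires, wires=target, control_values=control_values)
    qml.RY(-theta / 2, wires=target)
    qml.MultiControlledX(control_wires=control_wires, wires=target, control_values=control_values)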

I noticed that after the first loss.backward(), the input vector x becomes all NaN after the first max-pool layer (the first layer in the model). By the time it reaches the QNode, U is obviously no longer unitary.

I don’t think using qml.ControlledQubitUnitary is compatible with differentiation when using the torch interface

Do you think this can cause that issue?

MultiControlledX and a single qubit rotation, or PauliRot

Could you point me in the right direction on how I could do that?

Hi @Daniel63656,

Do you think this can cause that issue?

Quite probably; the cause is the unsupported differentiability, as Tom suggested previously.

Could you perhaps have a look if by adding the following code, further details are revealed?

from torch import autograd
autograd.set_detect_anomaly(True)

This can be helpful when backward-pass values become NaN when using Torch.


On the note of using a different, differentiable set of operations: it seems that after the first RY operation, the dataEmbedding function applies uniformly controlled RY operations via qml.ControlledQubitUnitary. Uniformly controlled RY operations are used in the previously mentioned MottonenStatePreparation circuit too. Applying a uniformly controlled RY operation can be substituted with CNOTs and RY operations: see Figures 1-2 in the original paper by Mottonen et al.
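
For the single-control case, the substitution sketched in those figures looks roughly like this (an illustrative helper, not a drop-in replacement for dataEmbedding):

import pennylane as qml

def uniformly_controlled_ry(a0, a1, control, target):
    # applies RY(a0) on `target` if `control` is |0>, and RY(a1) if it is |1>,
    # using only RY and CNOT (both of which support the parameter-shift rule)
    qml.RY((a0 + a1) / 2, wires=target)
    qml.CNOT(wires=[control, target])
    qml.RY((a0 - a1) / 2, wires=target)
    qml.CNOT(wires=[control, target])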

If I use MottonenStatePreparation I get another error

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-54-2d0d62781f31> in <module>
     51 criterion = nn.CrossEntropyLoss()
     52 optimizer = optim.SGD(hybrid_model.parameters(), lr=0.01, momentum=0.8)
---> 53 q2Hist = train(device, hybrid_model, optimizer, criterion, epochs, sparse_train)

~\Hahn_schickard\jupyter\pyTorch_utils.py in train(device, model, optimizer, criterion, epochs, training, testing)
     31             optimizer.zero_grad()
     32             # forward + backward + optimize
---> 33             outputs = model(inputs)
     34             _, predicted = torch.max(outputs.data, 1)
     35             correct += (predicted == labels).float().sum()

~\anaconda3\envs\myenv\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
    887             result = self._slow_forward(*input, **kwargs)
    888         else:
--> 889             result = self.forward(*input, **kwargs)
    890         for hook in itertools.chain(
    891                 _global_forward_hooks.values(),

<ipython-input-54-2d0d62781f31> in forward(self, x)
     42         x = F.relu(self.fc2(x))
     43         x = F.relu(self.fc3(x))
---> 44         x = self.fc4(x)
     45         x = self.fc5(x)
     46         return x

~\anaconda3\envs\myenv\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
    887             result = self._slow_forward(*input, **kwargs)
    888         else:
--> 889             result = self.forward(*input, **kwargs)
    890         for hook in itertools.chain(
    891                 _global_forward_hooks.values(),

~\anaconda3\envs\myenv\lib\site-packages\pennylane\qnn\torch.py in forward(self, inputs)
    272             reconstructor = []
    273             for x in torch.unbind(inputs):
--> 274                 reconstructor.append(self.forward(x))
    275             return torch.stack(reconstructor)
    276 

~\anaconda3\envs\myenv\lib\site-packages\pennylane\qnn\torch.py in forward(self, inputs)
    276 
    277         # If the input is 1-dimensional, calculate the forward pass as usual
--> 278         return self._evaluate_qnode(inputs)
    279 
    280     def _evaluate_qnode(self, x):

~\anaconda3\envs\myenv\lib\site-packages\pennylane\qnn\torch.py in _evaluate_qnode(self, x)
    291             **{arg: weight.to(x) for arg, weight in self.qnode_weights.items()},
    292         }
--> 293         return self.qnode(**kwargs).type(x.dtype)
    294 
    295     def __str__(self):

~\anaconda3\envs\myenv\lib\site-packages\pennylane\qnode.py in __call__(self, *args, **kwargs)
    550         if self.mutable or self.qtape is None:
    551             # construct the tape
--> 552             self.construct(args, kwargs)
    553 
    554         # execute the tape

~\anaconda3\envs\myenv\lib\site-packages\pennylane\qnode.py in construct(self, args, kwargs)
    530             self.qtape = self.qtape.expand(
    531                 depth=self.max_expansion,
--> 532                 stop_at=lambda obj: not isinstance(obj, qml.tape.QuantumTape)
    533                 and self.device.supports_operation(obj.name),
    534             )

~\anaconda3\envs\myenv\lib\site-packages\pennylane\tape\tape.py in expand(self, depth, stop_at, expand_measurements)
    557         """
    558         new_tape = expand_tape(
--> 559             self, depth=depth, stop_at=stop_at, expand_measurements=expand_measurements
    560         )
    561         new_tape._update()

~\anaconda3\envs\myenv\lib\site-packages\pennylane\tape\tape.py in expand_tape(tape, depth, stop_at, expand_measurements)
    193                 # Object is an operation; query it for its expansion
    194                 try:
--> 195                     obj = obj.expand()
    196                 except NotImplementedError:
    197                     # Object does not define an expansion; treat this as

~\anaconda3\envs\myenv\lib\site-packages\pennylane\templates\state_preparations\mottonen.py in expand(self)
    278             # Apply inverse y rotation cascade to prepare correct absolute values of amplitudes
    279             for k in range(len(wires_reverse), 0, -1):
--> 280                 alpha_y_k = _get_alpha_y(a, len(wires_reverse), k)
    281                 control = wires_reverse[k:]
    282                 target = wires_reverse[k - 1]

~\anaconda3\envs\myenv\lib\site-packages\pennylane\templates\state_preparations\mottonen.py in _get_alpha_y(a, n, k)
    204         division = numerator / denominator
    205 
--> 206     division = qml.math.where(denominator != 0.0, division, 0.0)
    207 
    208     return 2 * qml.math.arcsin(qml.math.sqrt(division))

~\anaconda3\envs\myenv\lib\site-packages\pennylane\math\fn.py in where(condition, x, y)
    969     tensor([ 0.6000,  0.2300,  0.7000, -4.0000, -5.0000], grad_fn=<SWhereBackward>)
    970     """
--> 971     return _get_multi_tensorbox([x, y]).where(condition, x, y, wrap_output=False)

~\anaconda3\envs\myenv\lib\site-packages\pennylane\math\tensorbox.py in _wrapper(*args, **kwargs)
     38             return cls(func(*args, **kwargs))
     39 
---> 40         return func(*args, **kwargs)
     41 
     42     return _wrapper

~\anaconda3\envs\myenv\lib\site-packages\pennylane\math\torch_box.py in where(condition, x, y)
    196     @wrap_output
    197     def where(condition, x, y):
--> 198         return torch.where(TorchBox.astensor(condition), *TorchBox.unbox_list([x, y]))

RuntimeError: expected scalar type float but found double

I can’t simply change the return type since a QNode must return a measurement

Hi @Daniel63656,

I’ve created a branch with the changes that Josh mentioned previously to help with the issue you are seeing. Could you try installing this version of PennyLane and checking if the error persists? That would only be until we’ve fully incorporated the fix.

The version on the branch can be installed via

pip install git+https://github.com/PennyLaneAI/pennylane.git@mottonen_cast_fix

Installing this PennyLane version still gives an error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-9-5196e0249258> in <module>
     50 criterion = nn.CrossEntropyLoss()
     51 optimizer = optim.SGD(hybrid_model.parameters(), lr=0.01, momentum=0.8)
---> 52 q3Hist = train(device, hybrid_model, optimizer, criterion, epochs, sparse_train)

~\Hahn_schickard\quantencomputing\jupyter\pyTorch_utils.py in train(device, model, optimizer, criterion, epochs, training, testing)
     31             optimizer.zero_grad()
     32             # forward + backward + optimize
---> 33             outputs = model(inputs)
     34             _, predicted = torch.max(outputs.data, 1)
     35             correct += (predicted == labels).float().sum()

~\anaconda3\envs\quantum\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
    887             result = self._slow_forward(*input, **kwargs)
    888         else:
--> 889             result = self.forward(*input, **kwargs)
    890         for hook in itertools.chain(
    891                 _global_forward_hooks.values(),

<ipython-input-9-5196e0249258> in forward(self, x)
     41         x = F.relu(self.fc2(x))
     42         x = F.relu(self.fc3(x))
---> 43         x = self.fc4(x)
     44         x = self.fc5(x)
     45         return x

~\anaconda3\envs\quantum\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
    887             result = self._slow_forward(*input, **kwargs)
    888         else:
--> 889             result = self.forward(*input, **kwargs)
    890         for hook in itertools.chain(
    891                 _global_forward_hooks.values(),

~\anaconda3\envs\quantum\lib\site-packages\pennylane\qnn\torch.py in forward(self, inputs)
    272             reconstructor = []
    273             for x in torch.unbind(inputs):
--> 274                 reconstructor.append(self.forward(x))
    275             return torch.stack(reconstructor)
    276 

~\anaconda3\envs\quantum\lib\site-packages\pennylane\qnn\torch.py in forward(self, inputs)
    276 
    277         # If the input is 1-dimensional, calculate the forward pass as usual
--> 278         return self._evaluate_qnode(inputs)
    279 
    280     def _evaluate_qnode(self, x):

~\anaconda3\envs\quantum\lib\site-packages\pennylane\qnn\torch.py in _evaluate_qnode(self, x)
    291             **{arg: weight.to(x) for arg, weight in self.qnode_weights.items()},
    292         }
--> 293         return self.qnode(**kwargs).type(x.dtype)
    294 
    295     def __str__(self):

~\anaconda3\envs\quantum\lib\site-packages\pennylane\qnode.py in __call__(self, *args, **kwargs)
    550         if self.mutable or self.qtape is None:
    551             # construct the tape
--> 552             self.construct(args, kwargs)
    553 
    554         # execute the tape

~\anaconda3\envs\quantum\lib\site-packages\pennylane\qnode.py in construct(self, args, kwargs)
    530             self.qtape = self.qtape.expand(
    531                 depth=self.max_expansion,
--> 532                 stop_at=lambda obj: not isinstance(obj, qml.tape.QuantumTape)
    533                 and self.device.supports_operation(obj.name),
    534             )

~\anaconda3\envs\quantum\lib\site-packages\pennylane\tape\tape.py in expand(self, depth, stop_at, expand_measurements)
    557         """
    558         new_tape = expand_tape(
--> 559             self, depth=depth, stop_at=stop_at, expand_measurements=expand_measurements
    560         )
    561         new_tape._update()

~\anaconda3\envs\quantum\lib\site-packages\pennylane\tape\tape.py in expand_tape(tape, depth, stop_at, expand_measurements)
    193                 # Object is an operation; query it for its expansion
    194                 try:
--> 195                     obj = obj.expand()
    196                 except NotImplementedError:
    197                     # Object does not define an expansion; treat this as

~\anaconda3\envs\quantum\lib\site-packages\pennylane\templates\state_preparations\mottonen.py in expand(self)
    278             # Apply inverse y rotation cascade to prepare correct absolute values of amplitudes
    279             for k in range(len(wires_reverse), 0, -1):
--> 280                 alpha_y_k = _get_alpha_y(a, len(wires_reverse), k)
    281                 control = wires_reverse[k:]
    282                 target = wires_reverse[k - 1]

~\anaconda3\envs\quantum\lib\site-packages\pennylane\templates\state_preparations\mottonen.py in _get_alpha_y(a, n, k)
    204         division = numerator / denominator
    205 
--> 206     division = qml.math.where(denominator != 0.0, division, 0.0)
    207 
    208     return 2 * qml.math.arcsin(qml.math.sqrt(division))

~\anaconda3\envs\quantum\lib\site-packages\pennylane\math\fn.py in where(condition, x, y)
    969     tensor([ 0.6000,  0.2300,  0.7000, -4.0000, -5.0000], grad_fn=<SWhereBackward>)
    970     """
--> 971     return _get_multi_tensorbox([x, y]).where(condition, x, y, wrap_output=False)

~\anaconda3\envs\quantum\lib\site-packages\pennylane\math\tensorbox.py in _wrapper(*args, **kwargs)
     38             return cls(func(*args, **kwargs))
     39 
---> 40         return func(*args, **kwargs)
     41 
     42     return _wrapper

~\anaconda3\envs\quantum\lib\site-packages\pennylane\math\torch_box.py in where(condition, x, y)
    196     @wrap_output
    197     def where(condition, x, y):
--> 198         return torch.where(TorchBox.astensor(condition), *TorchBox.unbox_list([x, y]))

RuntimeError: expected scalar type float but found double

Using PennyLane with CUDA has suddenly stopped working entirely for some reason, even after creating a fresh environment. I know that using CUDA with PennyLane can go wrong, but so far the same code has always worked on the GPU (I think the qLayer was executed on the CPU only; now this causes errors).

I also tried using DiagonalQubitUnitary for the embedding:

def diagonal_embedding(inputs, qubits):
    for q in qubits:
        qml.Hadamard(wires=q)
    real = torch.cos(inputs)
    imag = torch.sin(inputs)
    inputs = (real + imag*1j).clone().detach().requires_grad_(True)
    qml.DiagonalQubitUnitary(inputs, wires=qubits)

Again not differentiable :frowning:

ValueError                                Traceback (most recent call last)
<ipython-input-22-b4ba44eb2ba7> in <module>
     56 criterion = nn.CrossEntropyLoss()
     57 optimizer = optim.SGD(hybrid_model.parameters(), lr=0.01, momentum=0.8)
---> 58 q3Hist = train(device, hybrid_model, optimizer, criterion, epochs, sparse_train, sparse_test)

~\Hahn_schickard\jupyter\pyTorch_utils.py in train(device, model, optimizer, criterion, epochs, training, testing)
     35             correct += (predicted == labels).float().sum()
     36             loss = criterion(outputs, labels)
---> 37             loss.backward()
     38             optimizer.step()
     39             batch_percentage = i*20//number_batches

~\anaconda3\envs\myenv\lib\site-packages\torch\tensor.py in backward(self, gradient, retain_graph, create_graph, inputs)
    243                 create_graph=create_graph,
    244                 inputs=inputs)
--> 245         torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
    246 
    247     def register_hook(self, hook):

~\anaconda3\envs\myenv\lib\site-packages\torch\autograd\__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    145     Variable._execution_engine.run_backward(
    146         tensors, grad_tensors_, retain_graph, create_graph, inputs,
--> 147         allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
    148 
    149 

~\anaconda3\envs\myenv\lib\site-packages\torch\autograd\function.py in apply(self, *args)
     87     def apply(self, *args):
     88         # _forward_cls is defined by derived class
---> 89         return self._forward_cls.backward(self, *args)  # type: ignore
     90 
     91 

~\anaconda3\envs\myenv\lib\site-packages\pennylane\interfaces\torch.py in backward(ctx, dy)
    175         if dy.is_cuda:
    176             cuda_device = dy.get_device()
--> 177         vjp = dy.view(1, -1) @ ctx.jacobian.apply(ctx, *ctx.saved_tensors).to(dy)
    178         vjp = torch.unbind(vjp.view(-1))
    179         return (None,) + tuple(vjp)

~\anaconda3\envs\myenv\lib\site-packages\pennylane\interfaces\torch.py in forward(ctx_, parent_ctx, *input_)
    123                 ctx_.dy = parent_ctx.dy
    124                 ctx_.save_for_backward(*input_)
--> 125                 jacobian = _evaluate_grad_matrix("jacobian")
    126                 return jacobian
    127 

~\anaconda3\envs\myenv\lib\site-packages\pennylane\interfaces\torch.py in _evaluate_grad_matrix(grad_matrix_fn)
    108             tape.set_parameters(ctx.all_params_unwrapped, trainable_only=False)
    109             grad_matrix = getattr(tape, grad_matrix_fn)(
--> 110                 device, params=ctx.args, **tape.jacobian_options
    111             )
    112             tape.set_parameters(ctx.all_params, trainable_only=False)

~\anaconda3\envs\myenv\lib\site-packages\pennylane\tape\qubit_param_shift.py in jacobian(self, device, params, **options)
    122         self._append_evA_tape = True
    123         self._evA_result = None
--> 124         return super().jacobian(device, params, **options)
    125 
    126     def parameter_shift(self, idx, params, **options):

~\anaconda3\envs\myenv\lib\site-packages\pennylane\tape\jacobian_tape.py in jacobian(self, device, params, **options)
    514 
    515         # perform gradient method validation
--> 516         diff_methods = self._grad_method_validation(method)
    517 
    518         if not self._has_trainable_params(params, diff_methods):

~\anaconda3\envs\myenv\lib\site-packages\pennylane\tape\jacobian_tape.py in _grad_method_validation(self, method)
    198 
    199         if nondiff_params:
--> 200             raise ValueError(f"Cannot differentiate with respect to parameter(s) {nondiff_params}")
    201 
    202         numeric_params = {idx for idx, g in diff_methods.items() if g == "F"}

ValueError: Cannot differentiate with respect to parameter(s) {0}