Running the following hybrid model with `diff_method='adjoint'`

and `diff_method='backprop'`

gives different results during training.

```
import numpy as np
from sklearn.datasets import make_moons
import torch
import matplotlib.pyplot as plt
import torch.nn as nn
import pennylane as qml
import sys
from time import perf_counter
class Model(nn.Module):
    """Hybrid classical-quantum-classical network.

    Data flows through a classical network (2 -> 256 -> 2, Tanh output),
    then a two-qubit PennyLane circuit wrapped as a ``TorchLayer``, then a
    second classical network with the same architecture as the first.

    Args:
        dev: PennyLane device on which the quantum circuit is executed.
        diff_method: Differentiation method handed to ``qml.qnode``
            (for example ``"backprop"`` or ``"adjoint"``).
    """

    def __init__(self, dev, diff_method="backprop"):
        super().__init__()
        self.cnet_in = self.cnet()
        # Wrap the bound circuit method in a QNode so that TorchLayer can
        # register ``weights`` as a trainable torch parameter.
        self.qcircuit = qml.qnode(
            dev, interface="torch", diff_method=diff_method
        )(self.qnode)
        weight_shape = {"weights": (2,)}
        self.qlayer = qml.qnn.TorchLayer(self.qcircuit, weight_shape)
        self.cnet_out = self.cnet()

    def cnet(self):
        """Build a fresh classical block: Linear(2, 256) -> ReLU -> Linear(256, 2) -> Tanh."""
        return nn.Sequential(
            nn.Linear(2, 256),
            nn.ReLU(inplace=True),
            nn.Linear(256, 2),
            nn.Tanh(),
        )

    def qnode(self, inputs, weights):
        """Quantum circuit: encode ``inputs``, entangle, rotate, measure Z on both wires.

        Args:
            inputs: Sequence of features; element ``i`` is encoded on wire ``i``.
            weights: Two trainable RY rotation angles.

        Returns:
            List with the Pauli-Z expectation values of wire 0 and wire 1.
        """
        # Data encoding: one Hadamard followed by RZ(2 * feature) per wire.
        for wire, feature in enumerate(inputs):
            qml.Hadamard(wires=wire)
            qml.RZ(2.0 * feature, wires=wire)
        # Trainable part:
        qml.CNOT(wires=[0, 1])
        qml.RY(weights[0], wires=0)
        qml.RY(weights[1], wires=1)
        return [qml.expval(qml.PauliZ(wires=0)), qml.expval(qml.PauliZ(wires=1))]

    def forward(self, x):
        """Apply the input network, the quantum layer and the output network in sequence."""
        encoded = self.cnet_in(x)
        measured = self.qlayer(encoded)
        return self.cnet_out(measured)
def train(X, y_hot, dev_name, diff_method):
    """Train the hybrid ``Model`` and print per-epoch loss and final accuracy.

    Args:
        X: Input features, array-like of shape (n_samples, 2).
        y_hot: One-hot encoded labels as a torch tensor of shape (n_samples, 2).
        dev_name: Name of the PennyLane device, passed to ``qml.device``.
        diff_method: Differentiation method forwarded to ``Model``.

    The model is optimised with Adam (lr=0.01) on an L1 loss for 6 epochs
    using shuffled mini-batches of size 5 (incomplete batches are dropped).
    """
    dev = qml.device(dev_name, wires=2, shots=None)
    model = Model(dev, diff_method)

    # Train the model
    opt = torch.optim.Adam(model.parameters(), lr=0.01)
    loss = torch.nn.L1Loss()

    X = torch.as_tensor(X, dtype=torch.float32)
    y_hot = y_hot.float()

    batch_size = 5
    data_loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(X, y_hot),
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
    )
    # Derive the batch count from the loader instead of assuming 200 samples.
    batches = len(data_loader)

    epochs = 6
    for epoch in range(epochs):
        running_loss = 0.0
        for xs, ys in data_loader:
            opt.zero_grad()
            loss_evaluated = loss(model(xs), ys)
            loss_evaluated.backward()
            opt.step()
            # .item() detaches the value; summing the tensor itself would keep
            # every batch's autograd graph alive for the whole epoch.
            running_loss += loss_evaluated.item()

        avg_loss = running_loss / batches
        print("Average loss over epoch {}: {:.10f}".format(epoch + 1, avg_loss))

    # Evaluation needs no gradients.
    with torch.no_grad():
        y_pred = model(X)
    predictions = torch.argmax(y_pred, dim=1)
    # Recover the integer labels from the one-hot targets rather than relying
    # on a module-level ``y`` that only exists when run as a script.
    labels = torch.argmax(y_hot, dim=1)
    n_correct = int((predictions == labels).sum())
    accuracy = n_correct / len(labels)
    print(f"Accuracy: {accuracy * 100}%")
if __name__ == "__main__":
    # Usage: script.py <device_name> <diff_method>
    # e.g.   script.py default.qubit adjoint
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {sys.argv[0]} <device_name> <diff_method>")
    dev_name, diff_method = sys.argv[1], sys.argv[2]

    # Fix both RNGs so that runs with different diff methods start from the
    # same data set and the same initial weights.
    torch.manual_seed(42)
    np.random.seed(42)

    n_samples = 200
    X, y = make_moons(n_samples=n_samples, noise=0.1)
    # One-hot encode the binary labels as a float tensor of shape (n_samples, 2).
    y_hot = torch.nn.functional.one_hot(
        torch.as_tensor(y, dtype=torch.long), num_classes=2
    ).float()

    begin_time = perf_counter()
    train(X, y_hot, dev_name, diff_method)
    end_time = perf_counter()
    runtime = end_time - begin_time
    print(f'Runtime: {runtime:.2e} s or {(runtime/60):.2e} min.')
```

From the documentation and the literature, I understand that the two differentiation methods are analytic and should be equivalent. Is there an explanation for the discrepancy I observe when running the above code? Here are the outputs for backpropagation and adjoint differentiation, respectively.

```
/work/vabelis/miniconda3/envs/ae_qml_pnl/lib/python3.8
/site-packages/torch/autograd/__init__.py:154:
UserWarning: Casting complex values to real discards
the imaginary part (Triggered internally at
/opt/conda/conda-bld/pytorch_1640811757556/work/
aten/src/ATen/native/Copy.cpp:244.)
Variable._execution_engine.run_backward(
Average loss over epoch 1: 0.3586139083
Average loss over epoch 2: 0.1699218154
Average loss over epoch 3: 0.0833731964
Average loss over epoch 4: 0.1283999979
Average loss over epoch 5: 0.0720961764
Average loss over epoch 6: 0.0696858093
Accuracy: 98.5%
Runtime: 7.94e+00 s or 1.32e-01 min.
```

```
Average loss over epoch 1: 0.3586135507
Average loss over epoch 2: 0.1701160818
Average loss over epoch 3: 0.1003372073
Average loss over epoch 4: 0.0771262795
Average loss over epoch 5: 0.0591097102
Average loss over epoch 6: 0.0415146127
Accuracy: 100.0%
Runtime: 6.75e+00 s or 1.12e-01 min.
```

Versions:

- pytorch=1.10.2
- pennylane=0.21.0
- numpy=1.22.2

Thanks in advance for your answers!