Hello,
I have been testing different devices, e.g. default.qubit.torch and lightning.qubit.
If I understand correctly, default.qubit.torch is written purely in the Torch framework, while lightning.qubit is written in C++. So I am wondering which one would be faster if the workload can be parallelized. On the Lightning side, I see that lightning.gpu can parallelize the observable calculations across different GPUs, but not at the data level (e.g., splitting the batch_size across the GPUs, as in the sketch below). I then tried torch.nn.DataParallel with default.qubit.torch, since I expected that to work, but it raises an error about inconsistent tensor locations (some tensors on cuda:0, some on cuda:1). So I wonder whether my understanding is correct, and why torch.nn.DataParallel does not work with default.qubit.torch. Thanks!
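To make the expectation concrete: for a purely classical module, torch.nn.DataParallel splits the input batch along dimension 0, runs a replica of the model on each GPU, and gathers the outputs back on the primary device. A minimal sketch of that expected behaviour (the layer sizes here are just placeholders):

import torch
import torch.nn as nn

# A plain classical module; DataParallel replicates it onto each listed GPU.
net = nn.Sequential(nn.Linear(2, 8), nn.ReLU(), nn.Linear(8, 1)).to('cuda:0')
net = nn.DataParallel(net, device_ids=[0, 1])

# A batch of 200 samples is scattered as two chunks of 100 (one per GPU),
# processed in parallel, and the outputs are gathered back on cuda:0.
xs = torch.randn(200, 2, device='cuda:0')
out = net(xs)
print(out.shape, out.device)   # torch.Size([200, 1]) cuda:0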
Below I attach some information about my setup:
Name: PennyLane
Version: 0.34.0
Summary: PennyLane is a Python quantum machine learning library by Xanadu Inc.
Home-page: https://github.com/PennyLaneAI/pennylane
Author:
Author-email:
License: Apache License 2.0
Location: /home/shuwang/miniconda3/envs/job_env/lib/python3.10/site-packages
Requires: appdirs, autograd, autoray, cachetools, networkx, numpy, pennylane-lightning, requests, rustworkx, scipy, semantic-version, toml, typing-extensions
Required-by: PennyLane-Lightning, PennyLane-Lightning-GPU
Platform info: Linux-5.15.133.1.amd64-smp-x86_64-with-glibc2.31
Python version: 3.10.12
Numpy version: 1.26.3
Scipy version: 1.11.4
Installed devices:
- lightning.qubit (PennyLane-Lightning-0.34.0)
- default.gaussian (PennyLane-0.34.0)
- default.mixed (PennyLane-0.34.0)
- default.qubit (PennyLane-0.34.0)
- default.qubit.autograd (PennyLane-0.34.0)
- default.qubit.jax (PennyLane-0.34.0)
- default.qubit.legacy (PennyLane-0.34.0)
- default.qubit.tf (PennyLane-0.34.0)
- default.qubit.torch (PennyLane-0.34.0)
- default.qutrit (PennyLane-0.34.0)
- null.qubit (PennyLane-0.34.0)
- lightning.gpu (PennyLane-Lightning-GPU-0.34.0)
cuda
Traceback (most recent call last):
  File "/CT/QN3DScene/work/Hardware_test/test_qubit_default_torch.py", line 118, in <module>
    loss_evaluated = loss(model(xs), ys)
  File "/home/shuwang/miniconda3/envs/job_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/shuwang/miniconda3/envs/job_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/shuwang/miniconda3/envs/job_env/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 185, in forward
    outputs = self.parallel_apply(replicas, inputs, module_kwargs)
  File "/home/shuwang/miniconda3/envs/job_env/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py", line 200, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/home/shuwang/miniconda3/envs/job_env/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 110, in parallel_apply
    output.reraise()
  File "/home/shuwang/miniconda3/envs/job_env/lib/python3.10/site-packages/torch/_utils.py", line 694, in reraise
    raise exception
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/shuwang/miniconda3/envs/job_env/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 85, in _worker
    output = module(*input, **kwargs)
  File "/home/shuwang/miniconda3/envs/job_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/shuwang/miniconda3/envs/job_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/shuwang/miniconda3/envs/job_env/lib/python3.10/site-packages/torch/nn/modules/container.py", line 215, in forward
    input = module(input)
  File "/home/shuwang/miniconda3/envs/job_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/shuwang/miniconda3/envs/job_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/CT/QN3DScene/work/Hardware_test/test_qubit_default_torch.py", line 81, in forward
    res = self.QNode(x, self.weights, self.cond)
  File "/CT/QN3DScene/work/Hardware_test/test_qubit_default_torch.py", line 77, in QNode
    return qnode(inputs, weights, cond)
  File "/home/shuwang/miniconda3/envs/job_env/lib/python3.10/site-packages/pennylane/qnode.py", line 1039, in __call__
    res = qml.execute(
  File "/home/shuwang/miniconda3/envs/job_env/lib/python3.10/site-packages/pennylane/interfaces/execution.py", line 648, in execute
    results = inner_execute(tapes)
  File "/home/shuwang/miniconda3/envs/job_env/lib/python3.10/site-packages/pennylane/interfaces/execution.py", line 261, in inner_execute
    return cached_device_execution(tapes)
  File "/home/shuwang/miniconda3/envs/job_env/lib/python3.10/site-packages/pennylane/interfaces/execution.py", line 383, in wrapper
    res = list(fn(tuple(execution_tapes.values()), **kwargs))
  File "/home/shuwang/miniconda3/envs/job_env/lib/python3.10/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/home/shuwang/miniconda3/envs/job_env/lib/python3.10/site-packages/pennylane/_qubit_device.py", line 459, in batch_execute
    res = self.execute(circuit)
  File "/home/shuwang/miniconda3/envs/job_env/lib/python3.10/site-packages/pennylane/devices/default_qubit_torch.py", line 247, in execute
    return super().execute(circuit, **kwargs)
  File "/home/shuwang/miniconda3/envs/job_env/lib/python3.10/site-packages/pennylane/_qubit_device.py", line 277, in execute
    self.apply(circuit.operations, rotations=self._get_diagonalizing_gates(circuit), **kwargs)
  File "/home/shuwang/miniconda3/envs/job_env/lib/python3.10/site-packages/pennylane/devices/default_qubit_legacy.py", line 296, in apply
    self._state = self._apply_operation(self._state, operation)
  File "/home/shuwang/miniconda3/envs/job_env/lib/python3.10/site-packages/pennylane/devices/default_qubit_legacy.py", line 344, in _apply_operation
    return self._apply_unitary_einsum(state, matrix, wires)
  File "/home/shuwang/miniconda3/envs/job_env/lib/python3.10/site-packages/pennylane/devices/default_qubit_legacy.py", line 905, in _apply_unitary_einsum
    return self._einsum(einsum_indices, mat, state)
  File "/home/shuwang/miniconda3/envs/job_env/lib/python3.10/site-packages/torch/functional.py", line 377, in einsum
    return _VF.einsum(equation, operands)  # type: ignore[attr-defined]
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)
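If I read the traceback correctly, the final RuntimeError is just Torch refusing to mix tensors that live on two different GPUs in a single einsum; a standalone sketch (assuming two visible GPUs) reproduces the same message:

import torch

a = torch.randn(2, 2, device='cuda:0')
b = torch.randn(2, 2, device='cuda:1')
# Raises: RuntimeError: Expected all tensors to be on the same device,
# but found at least two devices, cuda:0 and cuda:1!
torch.einsum('ij,jk->ik', a, b)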
And here is my test code:
import pennylane as qml
import torch
import pdb
import math
from sklearn.datasets import make_moons
import numpy as np
import torch.nn as nn
import matplotlib.pyplot as plt
import time

qml.about()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
torch.set_default_dtype(torch.float32)

# Two-moons toy dataset: 2 features per sample, binary labels.
X, y = make_moons(n_samples=3000, noise=0.1)
y = y[:, np.newaxis]
data_loader = torch.utils.data.DataLoader(
    list(zip(X, y)), batch_size=200, shuffle=True, drop_last=True
)

class QuantumLayer(nn.Module):
    def __init__(self, n_qubits, rep, gaussian_std):
        super().__init__()
        self.n_qubits = n_qubits
        self.sim_dev = qml.device('default.qubit.torch', wires=n_qubits)
        self.show_plot = True
        # Trainable parameters for the Y-rotation and entangling blocks.
        self.weights = nn.Parameter(
            torch.normal(torch.zeros(n_qubits, rep),
                         torch.full((n_qubits, rep), gaussian_std)).to(device))
        self.cond = nn.Parameter(
            torch.normal(torch.zeros(n_qubits, n_qubits, rep),
                         torch.full((n_qubits, n_qubits, rep), gaussian_std)).to(device))

    def Y_rotations(self, params, var_qubits):
        # Perform Y-rotations
        for i in range(len(params)):
            qml.RY(params[i], wires=var_qubits[i])

    def conditional_full_entangle(self, weights, entangle_qbs):
        # All-to-all controlled-RY entanglement.
        n = len(entangle_qbs)
        for i in range(n):
            for j in range(n):
                if j != i:
                    qml.CRY(phi=weights[i, j], wires=[i, j])

    def QNode(self, inputs, weights, cond):
        @qml.qnode(self.sim_dev, interface='torch', diff_method='backprop')
        def qnode(inputs, weights, cond):
            qml.templates.AngleEmbedding(inputs, wires=[0, 1])
            # construct blocks
            for i in range(weights.size()[-1]):
                self.conditional_full_entangle(cond[:self.n_qubits, :, i], list(range(self.n_qubits)))
                self.Y_rotations(weights[:self.n_qubits, i], list(range(self.n_qubits)))
            return [qml.expval(qml.PauliZ(i)) for i in range(2)]

        if self.show_plot:
            # Draw the circuit once on the first forward pass.
            fig, ax = qml.draw_mpl(qnode, decimals=2)(inputs, weights, cond)
            plt.show()
            plt.savefig('Framework')
            self.show_plot = False
        return qnode(inputs, weights, cond)

    def forward(self, x):
        res = self.QNode(x, self.weights, self.cond)
        if torch.numel(res[0]) == 1:
            q_out = torch.stack(res).reshape(self.n_qubits, -1).T.float()
        else:
            q_out = torch.cat(res).reshape(self.n_qubits, -1).T.float()
        return q_out

q_layer = QuantumLayer(n_qubits=2, rep=2, gaussian_std=0.1)
clayer_2 = torch.nn.Linear(2, 1)
layers = [q_layer, clayer_2]
model = torch.nn.Sequential(*layers)
model.to(device)
# Wrap the model for data-level parallelism across GPUs 0 and 1.
model = torch.nn.DataParallel(model, device_ids=[0, 1])

opt = torch.optim.SGD(model.parameters(), lr=0.2)
loss = torch.nn.L1Loss()

epochs = 6
for epoch in range(epochs):
    running_loss = 0
    for xs, ys in data_loader:
        xs, ys = xs.to(device), ys.to(device)
        opt.zero_grad()
        loss_evaluated = loss(model(xs), ys)
        loss_evaluated.backward()
        opt.step()
        running_loss += loss_evaluated
    print(running_loss)
One more note: I also tested the same setup with a purely classical NN, and DataParallel works there. So I suspect the problem lies with the default.qubit.torch device.
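For reference, the classical control looked roughly like this (a reconstruction, not the exact script; the quantum layer is simply replaced by a linear layer, reusing device from the script above):

# Classical control: same pipeline, but with a plain linear layer
# standing in for QuantumLayer. This version trains without the
# device-mismatch error.
c_layer_1 = torch.nn.Linear(2, 2)
clayer_2 = torch.nn.Linear(2, 1)
model = torch.nn.Sequential(c_layer_1, clayer_2)
model.to(device)
model = torch.nn.DataParallel(model, device_ids=[0, 1])
# ...followed by the same SGD / L1Loss training loop as above.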