Hello,
I’m working on a circuit to train one weight per input. At first, I used default.qubit
, sending a batch of inputs and weights to be trained each circuit call. This is deemed to be slow (1.5s per epoch for batch size 512) and memory hungry, with a batch size of 512 requiring 29 GB. I switched then to lightning.gpu
following this discussion Backprop for Lightning.gpu. Sadly, this didn’t work as I keep getting the memory issue below for small batches (more than a batch size of 1). I suspect the device is still trying to create a state vector of 2^wires size, although they are not entangled. Is there any way to achieve true parallelisation in batches? What can be the origin of the issue with the high memory in default.qubit
? What’s the best way to achieve the fastest runtime using the setup below to train one weight per input?
Error:
Traceback (most recent call last):
File "/mnt/beegfs/work/user1/quantum_embeddings/snippet.py", line 70, in <module>
out = model(x)
File "/storage/host/work/user1/miniconda3/envs/QuantumWordEmbedding/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/storage/host/work/user1/miniconda3/envs/QuantumWordEmbedding/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/mnt/beegfs/work/user1/quantum_embeddings/snippet.py", line 62, in forward
return torch.stack(circuit(x, self.weights))
File "/storage/host/work/user1/miniconda3/envs/QuantumWordEmbedding/lib/python3.10/site-packages/pennylane/workflow/qnode.py", line 882, in __call__
return self._impl_call(*args, **kwargs)
File "/storage/host/work/user1/miniconda3/envs/QuantumWordEmbedding/lib/python3.10/site-packages/pennylane/workflow/qnode.py", line 855, in _impl_call
res = qml.execute(
File "/storage/host/work/user1/miniconda3/envs/QuantumWordEmbedding/lib/python3.10/site-packages/pennylane/workflow/execution.py", line 244, in execute
results = run(tapes, device, config, inner_transform)
File "/storage/host/work/user1/miniconda3/envs/QuantumWordEmbedding/lib/python3.10/site-packages/pennylane/workflow/run.py", line 332, in run
results = ml_execute(tapes, execute_fn, jpc, device=device)
File "/storage/host/work/user1/miniconda3/envs/QuantumWordEmbedding/lib/python3.10/site-packages/pennylane/workflow/interfaces/torch.py", line 236, in execute
return ExecuteTapes.apply(kwargs, *parameters)
File "/storage/host/work/user1/miniconda3/envs/QuantumWordEmbedding/lib/python3.10/site-packages/pennylane/workflow/interfaces/torch.py", line 89, in new_apply
flat_out = orig_apply(out_struct_holder, *inp)
File "/storage/host/work/user1/miniconda3/envs/QuantumWordEmbedding/lib/python3.10/site-packages/torch/autograd/function.py", line 575, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/storage/host/work/user1/miniconda3/envs/QuantumWordEmbedding/lib/python3.10/site-packages/pennylane/workflow/interfaces/torch.py", line 93, in new_forward
out = orig_fw(ctx, *inp)
File "/storage/host/work/user1/miniconda3/envs/QuantumWordEmbedding/lib/python3.10/site-packages/pennylane/workflow/interfaces/torch.py", line 158, in forward
res = tuple(kwargs["execute_fn"](ctx.tapes))
File "/storage/host/work/user1/miniconda3/envs/QuantumWordEmbedding/lib/python3.10/site-packages/pennylane/workflow/jacobian_products.py", line 482, in execute_and_cache_jacobian
results, jac = self._dev_execute_and_compute_derivatives(tapes)
File "/storage/host/work/user1/miniconda3/envs/QuantumWordEmbedding/lib/python3.10/site-packages/pennylane/workflow/jacobian_products.py", line 447, in _dev_execute_and_compute_derivatives
return self._device.execute_and_compute_derivatives(numpy_tapes, self._execution_config)
File "/storage/host/work/user1/miniconda3/envs/QuantumWordEmbedding/lib/python3.10/site-packages/pennylane/devices/modifiers/simulator_tracking.py", line 95, in execute_and_compute_derivatives
return untracked_execute_and_compute_derivatives(self, circuits, execution_config)
File "/storage/host/work/user1/miniconda3/envs/QuantumWordEmbedding/lib/python3.10/site-packages/pennylane/devices/modifiers/single_tape_support.py", line 60, in execute_and_compute_derivatives
results, jacs = batch_execute_and_compute_derivatives(self, circuits, execution_config)
File "/storage/host/work/user1/miniconda3/envs/QuantumWordEmbedding/lib/python3.10/site-packages/pennylane/devices/modifiers/simulator_tracking.py", line 95, in execute_and_compute_derivatives
return untracked_execute_and_compute_derivatives(self, circuits, execution_config)
File "/storage/host/work/user1/miniconda3/envs/QuantumWordEmbedding/lib/python3.10/site-packages/pennylane/devices/modifiers/single_tape_support.py", line 60, in execute_and_compute_derivatives
results, jacs = batch_execute_and_compute_derivatives(self, circuits, execution_config)
File "/storage/host/work/user1/miniconda3/envs/QuantumWordEmbedding/lib/python3.10/site-packages/pennylane_lightning/core/lightning_base.py", line 377, in execute_and_compute_derivatives
results = tuple(
File "/storage/host/work/user1/miniconda3/envs/QuantumWordEmbedding/lib/python3.10/site-packages/pennylane_lightning/core/lightning_base.py", line 379, in <genexpr>
self.dynamic_wires_from_circuit(circuit),
File "/storage/host/work/user1/miniconda3/envs/QuantumWordEmbedding/lib/python3.10/site-packages/pennylane_lightning/core/lightning_base.py", line 131, in dynamic_wires_from_circuit
self._statevector = self.LightningStateVector(
File "/storage/host/work/user1/miniconda3/envs/QuantumWordEmbedding/lib/python3.10/site-packages/pennylane_lightning/lightning_gpu/_state_vector.py", line 110, in __init__
self._qubit_state = self._state_dtype()(self.num_wires)
pennylane_lightning.lightning_gpu_ops.LightningException: [/project/pennylane_lightning/core/src/utils/cuda_utils/DataBuffer.hpp][Line:57][Method:DataBuffer]: Error in PennyLane Lightning: out of memory
Code snippet to reproduce the error:
import pennylane as qml
import torch
import torch.nn as nn
import torch.optim as optim
# Config
batch, dim, n_frozen, n_trainable, n_layers = 8, 100, 7, 7, 3
wires_per_sample = n_frozen + n_trainable + 1
total_wires = batch * wires_per_sample
# Device
dev = qml.device("lightning.gpu", wires=total_wires, batch_obs=3)
# Data (normalised)
x = torch.randn((batch, dim))
x = x / x.norm(dim=1, keepdim=True)
x = torch.nn.functional.pad(x, (0, 28)).to(torch.float32).cuda()
# Variational layer
def variational_layer(w, wires):
for l in range(w.shape[0]):
for i in range(w.shape[1]):
qml.RX(w[l, i, 0], wires=wires[i])
qml.RZ(w[l, i, 1], wires=wires[i])
for i in reversed(range(w.shape[1])):
for j in reversed(range(w.shape[1])):
if i != j:
qml.CRZ(w[l, i, 2 + j], wires=[wires[i], wires[j]])
for i in range(w.shape[1]):
qml.RX(w[l, i, -2], wires=wires[i])
qml.RZ(w[l, i, -1], wires=wires[i])
# QNode
@qml.qnode(dev, interface="torch", diff_method="adjoint")
def circuit(x, w):
ancilla_wires = []
for b in range(batch):
offset = b * wires_per_sample
f = range(offset, offset + n_frozen)
t = range(offset + n_frozen, offset + n_frozen + n_trainable)
anc = offset + n_frozen + n_trainable
qml.AmplitudeEmbedding(x[b], wires=f, normalize=False)
variational_layer(w[b], t)
qml.Hadamard(wires=anc)
for a, b_ in zip(f, t):
qml.CSWAP(wires=[anc, a, b_])
qml.Hadamard(wires=anc)
ancilla_wires.append(anc)
return [qml.expval(qml.PauliZ(w)) for w in ancilla_wires]
# Torch model
class QuantumModel(nn.Module):
def __init__(self):
super().__init__()
shape = (n_layers, n_trainable, n_trainable + 3)
self.weights = nn.Parameter(torch.rand((batch, *shape), device="cuda") * torch.pi)
def forward(self, x):
return torch.stack(circuit(x, self.weights))
# Training loop
model = QuantumModel().cuda()
opt = optim.Adam(model.parameters(), lr=0.01)
for epoch in range(5):
opt.zero_grad()
out = model(x)
loss = ((1 - (out + 1) / 2).mean())
loss.backward()
opt.step()
print(f"Epoch {epoch+1} | Loss: {loss.item():.4f}")