Pennylane and Pytorch running on GPU

I have two separate scripts – one with classical machine learning (nothing to do with PennyLane) and one with quantum ML (below). The only difference between the two is that the QML script has additional PennyLane code in the DQN class. The classical ML script runs without a problem on the GPU, but when I run the QML script I get an error. Here is part of the code:

import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from torch.nn.functional import relu, sigmoid
import pennylane as qml
import time

out_dim = 2  # output dimension of model
wires = 1  # this is the width of the quantum element
n_quantum_layers = 2  # this is the depth of the quantum element


def layer(inputs, w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10):
    """Quantum function: squeezing embedding followed by CV neural-net layers.

    Args:
        inputs: features passed to ``SqueezingEmbedding``.
        w0 ... w10: the eleven weight arguments forwarded, in order, to
            ``CVNeuralNetLayers``.

    Returns:
        A list with the expectation value of ``qml.X`` for every wire in
        ``range(wires)`` (``wires`` is the module-level circuit width).
    """
    modes = range(wires)
    cv_weights = (w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10)

    qml.templates.SqueezingEmbedding(inputs, wires=modes)
    qml.templates.CVNeuralNetLayers(*cv_weights, wires=modes)
    return [qml.expval(qml.X(wires=mode)) for mode in modes]


class DQN(nn.Module):
    """Hybrid network: classical linear layers around a PennyLane ``TorchLayer``.

    The forward pass flattens the input, applies two linear layers, projects
    down to ``wires`` features, runs the quantum layer, projects to
    ``out_dim`` outputs and finally applies a sigmoid.
    """

    def __init__(self, img_height, img_width):
        """Build the classical layers and the quantum layer.

        Args:
            img_height: height used to size the first linear layer.
            img_width: width used to size the first linear layer.
        """
        super().__init__()
        # The factor 3 is presumably the number of colour channels -- TODO confirm.
        flat_features = img_height * img_width * 3

        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(in_features=flat_features, out_features=12)
        self.fc2 = nn.Linear(in_features=12, out_features=8)
        self.clayer_in = nn.Linear(in_features=8, out_features=wires)
        self.clayer_out = nn.Linear(wires, out_dim)

        fock_device = qml.device('strawberryfields.fock', wires=wires, cutoff_dim=3)
        self.layer_qnode = qml.QNode(layer, fock_device)

        # Only the shapes of the generated weights are needed: TorchLayer
        # receives a name -> shape mapping keyed "w0", "w1", ...
        initial_weights = qml.init.cvqnn_layers_all(n_quantum_layers, wires)
        weight_shapes = {}
        for idx, weight in enumerate(initial_weights):
            weight_shapes[f"w{idx}"] = weight.shape

        self.qlayer = qml.qnn.TorchLayer(self.layer_qnode, weight_shapes)

    def forward(self, t):
        """Run ``t`` through every stage in order and squash with a sigmoid."""
        stages = (
            self.flatten,
            self.fc1,
            self.fc2,
            self.clayer_in,
            self.qlayer,
            self.clayer_out,
        )
        for stage in stages:
            t = stage(t)
        return torch.sigmoid(t)





#A lot of code between these two parts is left 
#out for the sake of brevity and necessity



# --- Hyperparameters -------------------------------------------------------
batch_size = 128
gamma = 0.999  # discount factor applied to the next-state Q-values
eps_start = 1
eps_end = 0.01
eps_decay = 0.0005
target_update = 10  # copy policy_net into target_net every N episodes
memory_size = 500000
# NOTE(review): lr_start / lr_end / lr_decay are not referenced by the code
# shown here; the optimizer below uses a fixed initial lr of 0.01.
lr_start = 0.01
lr_end = 0.00001
lr_decay = 0.00009
num_episodes = 1000  # run for more episodes for better results

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

em = CartPoleEnvManager(device)
strategy = EpsilonGreedyStrategy(eps_start, eps_end, eps_decay)
agent = Agent(strategy, em.num_actions_available(), device)
memory = ReplayMemory(memory_size)

policy_net = DQN(em.get_screen_height(), em.get_screen_width()).to(device)
target_net = DQN(em.get_screen_height(), em.get_screen_width()).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()  # target_net is only used for inference, never trained
optimizer = optim.Adam(params=policy_net.parameters(), lr=0.01)
# Build the scheduler exactly once. Creating a fresh StepLR on every
# optimisation step restarts its internal counter each time, so the
# step_size=100 boundary is never reached and the learning rate never decays.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.9)

i = 0  # debug step counter (currently unused)
episode_durations = []
for episode in range(num_episodes):  # iterate over each episode
    program_starts = time.time()
    em.reset()
    state = em.get_state()

    for timestep in count():
        action = agent.select_action(state, policy_net)
        reward = em.take_action(action)
        next_state = em.get_state()
        memory.push(Experience(state, action, next_state, reward))
        state = next_state

        if memory.can_provide_sample(batch_size):
            experiences = memory.sample(batch_size)
            states, actions, rewards, next_states = extract_tensors(experiences)

            current_q_values = QValues.get_current(policy_net, states, actions)
            # Max Q-values of the next states, evaluated with the target network.
            next_q_values = QValues.get_next(target_net, next_states)
            target_q_values = (next_q_values * gamma) + rewards

            loss = F.mse_loss(current_q_values, target_q_values.unsqueeze(1))
            optimizer.zero_grad()  # reset the gradients of all policy_net weights and biases
            loss.backward()  # gradient of the loss w.r.t. all policy_net weights and biases
            optimizer.step()  # update the weights and biases with the gradients from loss.backward()
            scheduler.step()  # advance the learning-rate schedule by one optimisation step

        if em.done:
            episode_durations.append(timestep)
            plot(episode_durations, 100)
            break

    if episode % target_update == 0:
        target_net.load_state_dict(policy_net.state_dict())
    now = time.time()
    print("Episode hat {0} Sekunden gedauert".format(now - program_starts))

em.close()

And when running the code, the following error appears:

Traceback (most recent call last):
  File "qdqn.py", line 328, in <module>
    loss.backward() #computes gradient of loss with respect to all weights n biases in the policy net
  File "/home/ubuntu/anaconda3/envs/gymm/lib/python3.8/site-packages/torch/tensor.py", line 198, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/home/ubuntu/anaconda3/envs/gymm/lib/python3.8/site-packages/torch/autograd/__init__.py", line 98, in backward
    Variable._execution_engine.run_backward(
RuntimeError: Expected object of device type cuda but got device type cpu for argument #2 'mat2' in call to _th_mm

Any insight is greatly appreciated.

Hi @Shawn!

For now, we suggest using the CPU with the TorchLayer from PennyLane. Although some examples might work, complete GPU support for the TorchLayer is something we’d have to explore further.

Recommend keeping an eye on the following relevant thread too:

1 Like

Hi @antalszava Many thanks for the reply. Yea that is a bummer because with a CPU the code will take too long to finish (like a year). Does Pennylane have a working GPU option with Tensorflow?

Hi @Shawn,

Note that it should be possible to have a QNode using the PyTorch interface that runs on GPU. It is the addition of using TorchLayer, i.e., converting the QNode to a torch.nn layer, that is more of an open question for running on GPU. This should also be the same with the TensorFlow interface and KerasLayer.

On the other hand, it’s also not clear that running on GPU will provide a speed up. The strawberryfields.fock device you are using works with the NumPy-based Fock backend of Strawberry Fields. Instead, I’d only be expecting a GPU-based speedup when using a device that is designed to interact with GPUs. One example in the qubit setting may be the default.qubit.tf device, which is written in TensorFlow and might be more amenable to running on GPU. Although, right now I’d say that GPU support/speedups are still on the to-do list in PennyLane.

Thanks,
Tom

Hi @Tom_Bromley thanks for the insight. Hm that’s unfortunate. I checked out the TorchLayer class – is it an open question if it could by chance run on a GPU? If so, is it in the pipeline to make the class work with certainty with GPUs?

To your last comment, will there be an option for GPU-based devices for continuous-variable systems soon? It just seems odd that with the non pennylane code, the code ran with CPU and GPU fine but the pennylane code takes a very long time to run with CPU.

Thanks again for the time and info.

Hey @Shawn,

I checked out the TorchLayer class – is it an open question if it could by chance run on a GPU?

Right now I’d say that either it doesn’t work on GPU, or it doesn’t yet work reliably on GPU and would suggest not using them. We haven’t prioritized GPU support yet since we were focusing on the core feature, so we’re really relying on users such as yourself and @mamadpierre in this post for feedback.

If so, is it in the pipeline to make the class work with certainty with GPUs?

It’s good to know that there’s some interest for this feature and we can add it to our to-do list. I can’t make any promises on when it will be available. As a side note if you’re interested, we always welcome contributors and this might be a nice and well specified thing to add.

To your last comment, will there be an option for GPU-based devices for continuous-variable systems soon? It just seems odd that with the non pennylane code, the code ran with CPU and GPU fine but the pennylane code takes a very long time to run with CPU.

I’d say the slow down is due to:

  • Fundamentally, we’re simulating a quantum system which in the strawberryfields.fock simulator scales exponentially with the number of modes. Unfortunately this is not something we can really get around with simulators, but a nice motivation for using hardware!

  • The code could be more optimized: we’re gradually adding performance improvements to elements of the code. For example, we’ve been implementing gates more efficiently in Strawberry Fields. I think there’s probably still room to optimize the performance running on CPUs before we concentrate on using GPUs.

Thanks!

1 Like

Hi @Tom_Bromley many thanks for the insight! I would be very interested in helping build this out. How would one get started? Make an issue and start from there? I am guessing I would have to learn a lot of the code from the packages first.

Thanks @Shawn, that would be cool!

We have a guide here on the forum for contributing to PennyLane. However in this case, I think you’re right that it would be good to first get a handle on the errors that we get when trying to run TorchLayer on GPU and report them as an issue on the PennyLane GitHub. We can then decide from there whether there is a reasonable fix.

Ok I made a new issue here: https://github.com/XanaduAI/pennylane/issues/709

Thanks @Shawn for this!

Hi,

what is the situation on GPU support right now? I don’t use qml.qnn.TorchLayer but get a similar error.

import numpy as np
import torch
import torch.nn as nn
import pennylane as qml

class QuanumLayer(nn.Module):
    """Torch module that evaluates a PennyLane qubit circuit per batch element.

    The circuit embeds ``3 * n_qubits`` input angles as RY/RZ/RY rotations,
    entangles the qubits with a ring of CNOTs, applies ``n_layers`` layers of
    controlled-RY gates and measures PauliZ on every qubit.

    NOTE: the class relies on a module-level ``device`` (a ``torch.device``)
    being defined before it is instantiated or called.
    """

    def __init__(self, n_qubits, n_layers):
        """Create the trainable weights and the simulator device.

        Args:
            n_qubits: number of qubits (wires) of the circuit.
            n_layers: number of variational layers.
        """
        super().__init__()
        self.n_qubits = n_qubits
        self.n_layers = n_layers
        # Move the raw tensor first and wrap it afterwards. Calling
        # ``.to(device)`` on an ``nn.Parameter`` returns a plain, non-leaf
        # tensor when the device differs, so the weight would not be
        # registered with the module and the optimizer would never update it.
        initial_weight = torch.randn((n_layers, n_qubits * (n_qubits - 1)))
        self.weight = nn.Parameter(initial_weight.to(device))
        self.sim_dev = qml.device('default.qubit', wires=n_qubits)

    def entanglement_layer(self, qubits):
        """Apply a ring of CNOTs: each qubit controls its successor (cyclic)."""
        n = len(qubits)
        for i in range(n):
            qml.CNOT(wires=[qubits[i], qubits[(i + 1) % n]])

    def rotation_embedding(self, params, qubits):
        """Embed ``params`` as RY, RZ, RY rotations, then entangle.

        ``params`` is indexed up to ``3 * len(qubits) - 1``: the first third
        feeds the first RY pass, the second third the RZ pass and the last
        third the second RY pass.
        """
        n = len(qubits)
        for i, q in enumerate(qubits):
            qml.RY(params[i], wires=q)
        for i, q in enumerate(qubits):
            qml.RZ(params[i + n], wires=q)
        for i, q in enumerate(qubits):
            qml.RY(params[i + 2 * n], wires=q)
        self.entanglement_layer(qubits)

    def variation_layer(self, params, qubits):
        """Apply a CRY gate for every ordered pair of distinct qubits.

        ``params`` supplies one angle per pair, consumed in iteration order
        (outer loop over targets, inner loop over controls).
        """
        idx = 0
        for target in qubits:
            for control in qubits:
                # Compare and address by wire label rather than by loop index
                # so the layer is also correct when ``qubits`` is not
                # ``range(n)``.
                if control != target:
                    qml.CRY(params[idx], wires=[control, target])
                    idx += 1

    def QNode(self, inputs, weights):
        """Evaluate the circuit for one input vector.

        Args:
            inputs: vector of length ``3 * n_qubits``.
            weights: iterable of per-layer parameter vectors.

        Returns:
            The QNode result moved to the module-level ``device``.

        Raises:
            IndexError: if ``inputs`` does not have length ``3 * n_qubits``.
        """
        if len(inputs) != self.n_qubits * 3:
            raise IndexError("Inputvector must have length 3*qubits")

        @qml.qnode(self.sim_dev, interface="torch")
        def qnode(inputs, weights):
            self.rotation_embedding(inputs, range(self.n_qubits))
            # variational circuit
            measure_wires = range(self.n_qubits)
            for w in weights:
                self.variation_layer(w, measure_wires)
            # measure
            return [qml.expval(qml.PauliZ(i)) for i in measure_wires]

        return qnode(inputs, weights).to(device)

    def forward(self, input_features):
        """Run the circuit on every element of the batch.

        Args:
            input_features: batch whose elements are vectors of length
                ``3 * n_qubits``; values are scaled by ``2 * pi`` before
                being used as rotation angles.

        Returns:
            Tensor with one row of ``n_qubits`` expectation values per batch
            element (an empty ``(0, n_qubits)`` tensor for an empty batch).
        """
        inputs = input_features * 2 * np.pi
        # The empty seed tensor keeps the result shape/dtype/device of the
        # empty-batch case; concatenating once avoids re-copying the growing
        # output on every iteration.
        seed = torch.Tensor(0, self.n_qubits).to(device)
        rows = [
            self.QNode(elem, self.weight).float().unsqueeze(0)
            for elem in inputs
        ]
        return torch.cat([seed, *rows])

Using this class in network raises this error:

~\Hahn_schickard\quantencomputing\jupyter\pyTorch_utils.py in train(device, model, optimizer, criterion, epochs, training, testing)
     35             correct += (predicted == labels).float().sum()
     36             loss = criterion(outputs, labels)
---> 37             loss.backward()
     38             optimizer.step()
     39             batch_percentage = i*20//number_batches

~\anaconda3\envs\quantum\lib\site-packages\torch\tensor.py in backward(self, gradient, retain_graph, create_graph, inputs)
    243                 create_graph=create_graph,
    244                 inputs=inputs)
--> 245         torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
    246 
    247     def register_hook(self, hook):

~\anaconda3\envs\quantum\lib\site-packages\torch\autograd\__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    145     Variable._execution_engine.run_backward(
    146         tensors, grad_tensors_, retain_graph, create_graph, inputs,
--> 147         allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
    148 
    149 

~\anaconda3\envs\quantum\lib\site-packages\torch\autograd\function.py in apply(self, *args)
     87     def apply(self, *args):
     88         # _forward_cls is defined by derived class
---> 89         return self._forward_cls.backward(self, *args)  # type: ignore
     90 
     91 

~\anaconda3\envs\quantum\lib\site-packages\pennylane\interfaces\torch.py in backward(ctx, dy)
    173         """Implements the backwards pass QNode vector-Jacobian product"""
    174         ctx.dy = dy
--> 175         vjp = dy.view(1, -1) @ ctx.jacobian.apply(ctx, *ctx.saved_tensors)
    176         vjp = torch.unbind(vjp.view(-1))
    177         return (None,) + tuple(vjp)

RuntimeError: Tensor for argument #3 'mat2' is on CPU, but expected it to be on GPU (while checking arguments for addmm)

Hi @Daniel63656,

Just this week we’ve merged a fix for this issue into the master branch of PennyLane. I recommend installing the development version (and please let us know if you are still running into trouble!)

Also just as a heads up, depending on your graphics card, you may experience (unrelated) errors of the form CUBLAS_STATUS_EXECUTION_FAILED. If you do, these should be solved by updating to pytorch 1.9

I upgraded pennylane via

 pip install pennylane -U

from 0.15.1 to 0.16.0

Error persists. Is this the merged version?

Hi @Daniel63656,

Version 0.16 is the current release version; the development version is 0.17. There are two ways to install it, the most straightforward is to use pip and grab the version directly from Github:

pip install git+https://github.com/PennyLaneAI/pennylane.git

The alternative is to actually clone the Github repo locally, and do pip install . from within the directory.

Thank you.
Now it works perfectly fine.

Fantastic, thanks for letting us know!

Hi,

I tried the following code which also gives errors. Could anyone help?

class QuantumLayer(nn.Module):
    """Torch module wrapping a PennyLane circuit evaluated per batch element.

    Six input angles (position + view) are angle-embedded on wires 0-5, two
    rounds of ``Rot`` gates and circular CNOT entanglement act on wires 0-3,
    optionally wires 4-6 are coupled in via CNOTs followed by another round of
    ``Rot`` gates on wires 0-2, and PauliZ is measured on wires 0-3.
    """

    def __init__(self, n_qubits = 7, include_view = True):
        """Create the simulator device and the three trainable weight tensors.

        Args:
            n_qubits: number of wires of the simulator device.
            include_view: whether to add the CNOT coupling from wires 4-6 and
                the extra ``Rot`` layer driven by ``weight2``.
        """
        super().__init__()
        self.n_qubits = n_qubits
        self.sim_dev = qml.device('default.qubit', wires=n_qubits)
        self.include_view = include_view
        self.weight1 = nn.Parameter(torch.randn(4, 3))
        self.weight2 = nn.Parameter(torch.randn(3, 3))
        self.weight3 = nn.Parameter(torch.randn(4, 3))

    def circular_entanglement_layer(self, entangling_qubits):
        """Apply a ring of CNOTs over ``entangling_qubits``.

        Raises:
            ValueError: if more qubits are given than the layer has available.
        """
        if len(entangling_qubits) > self.n_qubits:
            raise ValueError('Num of entangling qubits must be smaller than num of available qubits')
        n = len(entangling_qubits)
        for i in range(n):
            qml.CNOT(wires=[entangling_qubits[i], entangling_qubits[(i + 1) % n]])

    def rotation_embedding(self, x, encoding_qubits):
        """Angle-embed ``x`` on ``encoding_qubits``.

        Raises:
            ValueError: if the last dimension of ``x`` does not match the
                number of encoding qubits.
        """
        if np.shape(x)[-1] != len(encoding_qubits):
            raise ValueError('lengths of data to be encoded and encoding qubits must be equal')
        qml.AngleEmbedding(x, wires=encoding_qubits)

    def variation_layer(self, params, var_qubits):
        """Apply ``Rot(params[i, 0], params[i, 1], params[i, 2])`` on each qubit.

        Raises:
            ValueError: if ``params`` and ``var_qubits`` differ in length.
        """
        if len(params) != len(var_qubits):
            raise ValueError('lengths must be equal in variational layers')
        for i in range(len(params)):
            qml.Rot(params[i, 0], params[i, 1], params[i, 2], wires=var_qubits[i])

    def QNode(self, inputs, weights1, weights2, weights3):
        """Build the torch-interface QNode and evaluate it for one input."""

        @qml.qnode(self.sim_dev, interface='torch')
        def qnode(inputs, weights1, weights2, weights3):
            # inputs: 3 position + 3 view angles, embedded on wires 0-5
            self.rotation_embedding(inputs, list(range(3)) + list(range(3, 6)))
            self.variation_layer(weights1, list(range(4)))
            self.circular_entanglement_layer(list(range(4)))
            self.variation_layer(weights3, list(range(4)))
            self.circular_entanglement_layer(list(range(4)))

            if self.include_view:
                # Couple each of wires 4, 5, 6 into wires 0, 1, 2.
                for control in (4, 5, 6):
                    for target in (0, 1, 2):
                        qml.CNOT(wires=[control, target])
                self.variation_layer(weights2, list(range(3)))

            return [qml.expval(qml.PauliZ(i)) for i in range(4)]

        return qnode(inputs, weights1, weights2, weights3)

    def forward(self, chunk_pos, chunk_view):
        """Evaluate the circuit for every element of the batch.

        Args:
            chunk_pos: tensor of shape (batch_size, 3).
            chunk_view: tensor of shape (batch_size, 3).

        Returns:
            Float tensor with one row of four expectation values per batch
            element (an empty ``(0, 4)`` tensor for an empty batch).
        """
        inputs = torch.cat((chunk_pos, chunk_view), dim=-1)

        rows = []
        for elem in inputs:
            result = self.QNode(elem, self.weight1, self.weight2, self.weight3)
            # Depending on the PennyLane version, a QNode that returns a list
            # of measurements may hand back a list/tuple of scalar tensors
            # rather than one tensor; normalise to a single tensor.
            if isinstance(result, (list, tuple)):
                result = torch.stack(tuple(result))
            rows.append(result.float().unsqueeze(0))

        if not rows:
            # Four PauliZ expectation values are measured per element.
            return torch.empty((0, 4), device=inputs.device)
        return torch.cat(rows)

But I got errors:

/usr/local/lib/python3.10/dist-packages/pennylane/devices/default_qubit.py in (.0)
473 if max_workers is None:
474 results = tuple(
→ 475 simulate(
476 c,
477 rng=self._rng,

/usr/local/lib/python3.10/dist-packages/pennylane/devices/qubit/simulate.py in simulate(circuit, rng, prng_key, debugger, interface)
267
268 “”"
→ 269 state, is_state_batched = get_final_state(circuit, debugger=debugger, interface=interface)
270 return measure_final_state(circuit, state, is_state_batched, rng=rng, prng_key=prng_key)

/usr/local/lib/python3.10/dist-packages/pennylane/devices/qubit/simulate.py in get_final_state(circuit, debugger, interface)
159 is_state_batched = bool(prep and prep.batch_size is not None)
160 for op in circuit.operations[bool(prep) :]:
→ 161 state = apply_operation(op, state, is_state_batched=is_state_batched, debugger=debugger)
162
163 # Handle postselection on mid-circuit measurements

/usr/lib/python3.10/functools.py in wrapper(*args, **kw)
887 '1 positional argument')
888
→ 889 return dispatch(args[0].__class__)(*args, **kw)
890
891 funcname = getattr(func, '__name__', 'singledispatch function')

/usr/local/lib/python3.10/dist-packages/pennylane/devices/qubit/apply_operation.py in apply_operation(op, state, is_state_batched, debugger)
196
197 “”"
→ 198 return _apply_operation_default(op, state, is_state_batched, debugger)
199
200

/usr/local/lib/python3.10/dist-packages/pennylane/devices/qubit/apply_operation.py in _apply_operation_default(op, state, is_state_batched, debugger)
206 and math.ndim(state) < EINSUM_STATE_WIRECOUNT_PERF_THRESHOLD
207 ) or (op.batch_size and is_state_batched):
→ 208 return apply_operation_einsum(op, state, is_state_batched=is_state_batched)
209 return apply_operation_tensordot(op, state, is_state_batched=is_state_batched)
210

/usr/local/lib/python3.10/dist-packages/pennylane/devices/qubit/apply_operation.py in apply_operation_einsum(op, state, is_state_batched)
98 reshaped_mat = math.reshape(mat, new_mat_shape)
99
→ 100 return math.einsum(einsum_indices, reshaped_mat, state)
101
102

/usr/local/lib/python3.10/dist-packages/pennylane/math/multi_dispatch.py in einsum(indices, like, optimize, *operands)
537 if like is None:
538 like = get_interface(*operands)
→ 539 operands = np.coerce(operands, like=like)
540 if optimize is None or like == "torch":
541 # torch einsum doesn’t support the optimize keyword argument

/usr/local/lib/python3.10/dist-packages/autoray/autoray.py in do(fn, like, *args, **kwargs)
78 “”"
79 backend = choose_backend(fn, *args, like=like, **kwargs)
—> 80 return get_lib_fn(backend, fn)(*args, **kwargs)
81
82

/usr/local/lib/python3.10/dist-packages/pennylane/math/single_dispatch.py in _coerce_types_torch(tensors)
603 # GPU specific case
604 device_names = ", ".join(str(d) for d in device_set)
→ 605 raise RuntimeError(
606 f"Expected all tensors to be on the same device, but found at least two devices, {device_names}!"
607 )

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0, cpu!

Does someone know how to fix this?

Hey @Daniel_Wang! Welcome to the forum :sunglasses:

I can’t see anything obvious in your code example, but there are things missing for me to be able to reproduce what you’re getting. In any case, it looks like somewhere along the line you’re creating a torch tensor on a CPU device instead of a GPU device. You’ll need to do something like this on your CPU-resident tensors:

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
input_tensor = input_tensor.to(device)
output_tensor = output_tensor.to(device)

or you can set the default device like this (see here: torch.set_default_device — PyTorch 2.1 documentation)

>>> torch.tensor([1.2, 3]).device
device(type='cpu')
>>> torch.set_default_device('cuda')  # current device is 0
>>> torch.tensor([1.2, 3]).device
device(type='cuda', index=0)
>>> torch.set_default_device('cuda:1')
>>> torch.tensor([1.2, 3]).device
device(type='cuda', index=1)

Let me know if either of these help!

Hi,

Thank you for your info. I solved the problem by setting the device to default.qubit.torch and it works now.

The input tensor, output tensors and the model are all in GPU. It is just that using default.qubit always pops the error that something is inconsistent. Using default.qubit.torch solves it.

1 Like

Hi @Daniel_Wang! Please could you let us know which version of PennyLane you are using?
You can get this by copying here the output of qml.about().