What's the problem with my code implementing the expectation value of a shot-based device?

Hi, I was doing some research on ma-QAOA. I was trying to speed up the process of computing the expectation value in a shot-based environment using torch.multinomial, like this:

import pennylane as qml
import torch
import numpy as np
import os, pandas as pd, random, gc
from collections import defaultdict
from typing import List, Tuple, Dict

# -------------------- Basic settings --------------------
cuda_id = '0'
os.environ['CUDA_VISIBLE_DEVICES'] = cuda_id
torch.set_default_dtype(torch.float64)
device = torch.device("cuda")

N = 12
m = 2
file_name = "maQAOA"
p = 1
PI = torch.pi

# === Initialization and optimization settings ===
ADAM_LR = 0.01
WINDOW = 100
TOL = 1e-2
MAX_EPOCH = 500
SHOTS = 2**10

# --------------------------------------------------
def generate_case(seed=42):
    def generate_bbv_graph(
        N: int,
        m: int,
        seed: int | None = None,
        m0: int | None = None,
        w0: float = 1.0,
        delta: float = 1.0
    ) -> Tuple[List[Tuple[int, int]], List[float], List[float]]:
        rng = random.Random(seed)
        adj: Dict[int, Dict[int, float]] = defaultdict(dict)
        strength = [0.0] * N

        if m0 is None:
            m0 = m
        for i in range(m0):
            for j in range(i + 1, m0):
                adj[i][j] = w0
                adj[j][i] = w0
                strength[i] += w0
                strength[j] += w0

        def weighted_sample_without_replacement(pop_indices: List[int], weights: List[float], k: int) -> List[int]:
            if sum(weights) <= 0:
                return rng.sample(pop_indices, k)
            chosen = []
            pool = pop_indices[:]
            w = weights[:]
            for _ in range(k):
                pick = rng.choices(pool, weights=w, k=1)[0]
                idx = pool.index(pick)
                chosen.append(pick)
                pool.pop(idx); w.pop(idx)
            return chosen

        for new_v in range(m0, N):
            candidates = list(range(new_v))
            weights_s = [strength[u] for u in candidates]
            targets = weighted_sample_without_replacement(candidates, weights_s, m)

            for u in targets:
                s_u_old = strength[u]
                if s_u_old > 0 and delta > 0:
                    increments = []
                    for v, w_uv in adj[u].items():
                        inc = delta * w_uv / s_u_old
                        if inc > 0:
                            increments.append((v, inc))
                    for v, inc in increments:
                        adj[u][v] += inc
                        adj[v][u] += inc
                        strength[u] += inc
                        strength[v] += inc

                if u not in adj[new_v]:
                    adj[new_v][u] = 0.0
                    adj[u][new_v] = 0.0
                adj[new_v][u] += w0
                adj[u][new_v] += w0
                strength[new_v] += w0
                strength[u]     += w0

        edges = []
        weights = []
        for u in range(N):
            for v, w in adj[u].items():
                if u < v:
                    edges.append((u, v))
                    weights.append(w)
        return edges, weights, strength

    edges, weights, deg_w = generate_bbv_graph(N=N, m=m, seed=seed, m0=None, w0=1.0, delta=1)
    weights = torch.tensor(weights, device=device, dtype=torch.float64)

    # --- Cost vector ---
    num_states = 2 ** N
    states = torch.arange(num_states, device=device, dtype=torch.int64).unsqueeze(1)
    bits = ((states >> torch.arange(N, device=device)) & 1).bool()
    costs = torch.zeros(num_states, device=device, dtype=torch.float64)
    for idx, (i, j) in enumerate(edges):
        diff = bits[:, i] ^ bits[:, j]
        costs += weights[idx] * diff.to(torch.float64)

    print(f"Min: {costs.min():.4f}, Mean: {costs.mean():.4f}, Max: {costs.max():.4f}")

    # --- Hamiltonian ---
    coeffs = []
    observables = []
    for (i, j), w in zip(edges, weights.tolist()):
        coeffs.append(0.5 * w)                        
        observables.append(qml.Identity(0))
        coeffs.append(-0.5 * w)                       
        observables.append(qml.PauliZ(i) @ qml.PauliZ(j))
    cost_hamiltonian = qml.Hamiltonian(coeffs, observables)

    return weights, edges, deg_w, costs, cost_hamiltonian

# --------------------------------------------------
def init_params(p: int, N: int, M: int, seed: int = 42) -> np.ndarray:
    rng = random.Random(seed)
    kspace = list(range(-8, 9))
    params = [rng.choice(kspace) * (np.pi/8.0) for _ in range(p*(M+N))]
    return np.array(params, dtype=np.float64)

# ======================= Sampling and hand-rolled parameter-shift =======================
def exoval_by_shots(state: torch.Tensor, shots: int, costs: torch.Tensor) -> torch.Tensor:
    probs = (state.abs() ** 2).to(torch.float64)
    idx = torch.multinomial(probs, num_samples=shots, replacement=True)
    return costs[idx].mean()

def build_qaoa_state_fn(N, p, edges, weights, dev=None):
    if dev is None:
        dev = qml.device("lightning.gpu", wires=N, shots=None)

    @qml.qnode(dev, interface='torch')
    def qaoa_circuit(params):
        M = len(edges)
        gammas = params[:p*M].view(p, M)
        betas  = params[p*M:].view(p, N)
        for i in range(N):
            qml.Hadamard(wires=i)
        for layer in range(p):
            for e_idx, (i, j) in enumerate(edges):
                qml.CNOT(wires=(i, j))
                qml.RZ(weights[e_idx] * gammas[layer, e_idx], wires=j)
                qml.CNOT(wires=(i, j))
            for i in range(N):
                qml.RX(betas[layer, i], wires=i)
        return qml.state()

    def qaoa_state(params: torch.Tensor) -> torch.Tensor:
        state = qaoa_circuit(params)
        if not torch.is_tensor(state):
            state = torch.tensor(state)
        if state.device.type != 'cuda':
            state = state.to(device)
        if state.dtype != torch.complex128:
            state = state.to(torch.complex128)
        return state
    return qaoa_state

@torch.no_grad()
def parameter_shift(qaoa_state_fn, params, shots, costs, N, M, p, weights):
    L = p*(M+N)
    grads = torch.empty(L, device=device, dtype=torch.float64)

    def f(_params: torch.Tensor) -> torch.Tensor:
        state = qaoa_state_fn(_params)
        return exoval_by_shots(state, shots, costs)

    for k in range(L):
        params_plus  = params.clone()
        params_minus = params.clone()
        if k < p*M:
            e_idx = k % M
            w = float(weights[e_idx].item())
            if w < 1e-12:
                grads[k] = 0.0
                continue
            delta = (np.pi/2.0) / w
            params_plus[k]  += delta
            params_minus[k] -= delta
            f_plus  = f(params_plus)
            f_minus = f(params_minus)
            grads[k] = 0.5 * w * (f_plus - f_minus)
        else:
            delta = np.pi/2.0
            params_plus[k]  += delta
            params_minus[k] -= delta
            f_plus  = f(params_plus)
            f_minus = f(params_minus)
            grads[k] = 0.5 * (f_plus - f_minus)
    return grads

# ======================= run_case =======================
def run_case(weights, edges, costs, cost_hamiltonian, init_params, instance_id, trial_id):
    M = len(edges)
    params = torch.tensor(init_params, device=device, dtype=torch.float64, requires_grad=False)
    params = torch.nn.Parameter(params, requires_grad=False)
    optimizer = torch.optim.Adam([params], lr=ADAM_LR)

    qaoa_state_fn = build_qaoa_state_fn(N, p, edges, weights)

    def estimate_objective(_params: torch.Tensor) -> torch.Tensor:
        state = qaoa_state_fn(_params)
        return exoval_by_shots(state, SHOTS, costs)

    loss_list = []
    best_para = params.detach().clone()
    max_exp = float('-inf')

    def is_converged(window=WINDOW, tol=TOL):
        if len(loss_list) < window:
            return False
        recent = loss_list[-window:]
        return (max(recent) - min(recent)) < tol * weights.sum().detach().item()

    for epoch in range(1, MAX_EPOCH+1):
        exp_est = estimate_objective(params.data)
        grads = parameter_shift(qaoa_state_fn, params.data, SHOTS, costs, N, M, p, weights)
        optimizer.zero_grad(set_to_none=True)
        params.grad = -grads
        optimizer.step()

        approx_ratio = (exp_est / costs.max()).item()
        loss_list.append(float(-exp_est))
        if exp_est.item() > max_exp:
            max_exp = exp_est.item()
            best_para = params.detach().clone()

        print(f"[instance:{instance_id}][trial:{trial_id}] Epoch {epoch} | approximate ratio {approx_ratio:.6f}")
        if is_converged(WINDOW, TOL):
            break

    print(f"[instance:{instance_id}][trial:{trial_id}] best approximate ratio:{max_exp/costs.max().item():.6f}")
    return [best_para.tolist()], [max_exp/costs.max().item()], [len(loss_list)], [loss_list]

# ======================= Main program =======================
for i in range(20):
    os.makedirs(f'{file_name}/{i}', exist_ok=True)
    weights, edges, deg_w, costs, cost_hamiltonian = generate_case(i)
    M = len(edges)
    for trial_id in range(100):
        if os.path.exists(f'{file_name}/{i}/best_params.csv'):
            data = pd.read_csv(f'{file_name}/{i}/best_params.csv', header=None)
            data = np.array(data[0])
            if len(data) != trial_id:
                continue
        params0 = init_params(p=p, N=N, M=M, seed=trial_id)
        best_params_list, best_approx_list, num_epoch_list, loss_list_list = run_case(
            weights, edges, costs, cost_hamiltonian, params0, i, trial_id
        )
        pd.DataFrame(best_params_list).to_csv(f'{file_name}/{i}/best_params.csv', mode='a', index=False, header=False)
        pd.DataFrame(best_approx_list).to_csv(f'{file_name}/{i}/best_approx.csv', mode='a', index=False, header=False)
        pd.DataFrame(num_epoch_list).to_csv(f'{file_name}/{i}/num_epoch.csv', mode='a', index=False, header=False)
        pd.DataFrame(loss_list_list).to_csv(f'{file_name}/{i}/loss_list.csv', mode='a', index=False, header=False)
    del weights, edges, deg_w, costs, cost_hamiltonian
    gc.collect()
    torch.cuda.empty_cache()

, and I got worse convergence and larger noise compared to the code using the PennyLane API, like this:

import pennylane as qml
import torch
import numpy as np
import os, pandas as pd, random, gc
from collections import defaultdict
from typing import List, Tuple, Dict

cuda_id = '0'

# === Basic settings ===
N = 12
m = 2
file_name = "maQAOA"
p = 1
PI = torch.pi

os.environ['CUDA_VISIBLE_DEVICES'] = cuda_id
device = torch.device("cuda")

# === Initialization and optimization settings ===
ADAM_LR = 0.01
WINDOW = 100
TOL = 1e-2
MAX_EPOCH = 500
SHOTS = 1024




#--------------------------------------------------#
def generate_case(seed=42):
    def generate_bbv_graph(
        N: int,
        m: int,
        seed: int | None = None,
        m0: int | None = None,
        w0: float = 1.0,
        delta: float = 1.0
    ) -> Tuple[List[Tuple[int, int]], List[float], List[float]]:
        """
        Generate a BBV (Barrat–Barthélemy–Vespignani) weighted network.

        Returns:
        edges:    [(u, v), ...] with u < v
        weights:  [w_uv, ...] aligned with edges
        strength: [s_i], the weighted degree of each node, s_i = sum_j w_ij
        """
        assert 1 <= m < N, "requires 1 <= m < N"
        if m0 is None:
            m0 = m
        assert m0 >= m and m0 <= N
        assert w0 >= 0 and delta >= 0
        rng = random.Random(seed)

        adj: Dict[int, Dict[int, float]] = defaultdict(dict)
        strength = [0.0] * N

        # Initial complete graph K_{m0}
        for i in range(m0):
            for j in range(i + 1, m0):
                adj[i][j] = w0
                adj[j][i] = w0
                strength[i] += w0
                strength[j] += w0

        def weighted_sample_without_replacement(pop_indices: List[int], weights: List[float], k: int) -> List[int]:
            if sum(weights) <= 0:
                return rng.sample(pop_indices, k)
            chosen = []
            pool = pop_indices[:]
            w = weights[:]
            for _ in range(k):
                pick = rng.choices(pool, weights=w, k=1)[0]
                idx = pool.index(pick)
                chosen.append(pick)
                pool.pop(idx); w.pop(idx)
            return chosen

        for new_v in range(m0, N):
            candidates = list(range(new_v))
            weights_s = [strength[u] for u in candidates]
            targets = weighted_sample_without_replacement(candidates, weights_s, m)

            for u in targets:
                s_u_old = strength[u]
                if s_u_old > 0 and delta > 0:
                    increments = []
                    for v, w_uv in adj[u].items():
                        inc = delta * w_uv / s_u_old
                        if inc > 0:
                            increments.append((v, inc))
                    for v, inc in increments:
                        adj[u][v] += inc
                        adj[v][u] += inc
                        strength[u] += inc
                        strength[v] += inc

                if u not in adj[new_v]:
                    adj[new_v][u] = 0.0
                    adj[u][new_v] = 0.0
                adj[new_v][u] += w0
                adj[u][new_v] += w0
                strength[new_v] += w0
                strength[u]     += w0

        edges = []
        weights = []
        for u in range(N):
            for v, w in adj[u].items():
                if u < v:
                    edges.append((u, v))
                    weights.append(w)
        return edges, weights, strength

    edges, weights, deg_w = generate_bbv_graph(
        N=N, m=m, seed=seed, m0=None, w0=1.0, delta=1
    )
    weights = torch.tensor(weights, device=device, dtype=torch.float32)

    # --- Cost vector (brute force, to compute the MaxCut optimum) ---
    num_states = 2 ** N
    states = torch.arange(num_states, device=device).unsqueeze(1)
    bits = ((states >> torch.arange(N, device=device)) & 1).bool()  # LSB->MSB
    costs = torch.zeros(num_states, device=device, dtype=torch.float32)
    for idx, (i, j) in enumerate(edges):
        diff = bits[:, i] ^ bits[:, j]
        costs += weights[idx] * diff.float()

    print(f"Min: {costs.min():.4f}, Mean: {costs.mean():.4f}, Max: {costs.max():.4f}")

    # --- Also build the Hamiltonian used for expval (constant + ZZ form) ---
    coeffs = []
    observables = []
    for (i, j), w in zip(edges, weights.tolist()):
        coeffs.append(0.5 * w)                        # constant term * I
        observables.append(qml.Identity(0))
        coeffs.append(-0.5 * w)                       # -0.5 w Z_i Z_j
        observables.append(qml.PauliZ(i) @ qml.PauliZ(j))
    cost_hamiltonian = qml.Hamiltonian(coeffs, observables)

    return weights, edges, deg_w, costs, cost_hamiltonian

#--------------------------------------------------#
def init_params(p: int, N: int, M: int, seed: int = 42) -> np.ndarray:
    rng = random.Random(seed)
    kspace = list(range(-8, 9))

    params = [rng.choice(kspace)*(np.pi/8.0) for _ in range(p*(M+N))]
    
    return np.array(params, dtype=np.float32)

#--------------------------------------------------#
def run_case(weights, edges, costs, cost_hamiltonian, init_params, instance_id, trial_id):
    M = len(edges)
    dev = qml.device("lightning.gpu", wires=N, shots=SHOTS)
    params = torch.nn.Parameter(torch.tensor(init_params, device=device), requires_grad=True)

    # === ma-QAOA 電路 ===
    @qml.qnode(dev, interface='torch', diff_method='parameter-shift')
    def qaoa_circuit(params):
        # Parameter slicing: each layer is [gammas(M), betas(N)]
        gammas = params[:p*M].view(p, M)
        betas  = params[p*M:].view(p, N)

        # Uniform superposition
        for i in range(N):
            qml.Hadamard(wires=i)

        for layer in range(p):
            # Cost layer: each edge has its own γ_{layer,e}
            for e_idx, (i, j) in enumerate(edges):
                # CNOT - RZ - CNOT is equivalent to exp(-i * (theta/2) * Z_i Z_j)
                # Following the original convention: theta = w_e * gamma_{l,e}
                qml.CNOT(wires=(i, j))
                qml.RZ(weights[e_idx] * gammas[layer, e_idx], wires=j)
                qml.CNOT(wires=(i, j))
            # Mixer: each vertex has its own β_{layer,v}
            for i in range(N):
                qml.RX(betas[layer, i], wires=i)
        return qml.expval(cost_hamiltonian)

    def loss_fn(params):
        # Maximize the expectation value => minimize its negative
        return -qaoa_circuit(params)

    optimizer = torch.optim.Adam([params], lr=ADAM_LR)
    loss_list = []
    best_para = params
    max_exp = float('-inf')

    def is_converged(window=WINDOW, tol=TOL):
        if len(loss_list) < window:
            return False
        recent = loss_list[-window:]
        return (max(recent) - min(recent)) < tol * weights.sum().detach().item()

    for epoch in range(1, MAX_EPOCH+1):
        optimizer.zero_grad()
        loss = loss_fn(params)
        loss.backward()
        optimizer.step()

        loss_val = loss.item()
        if -loss_val > max_exp:
            max_exp = -loss_val
            best_para = params.detach().clone()

        loss_list.append(loss_val)
        print(f"[instance:{instance_id}][trial:{trial_id}] Epoch {epoch} | approximate ratio {-loss_val/costs.max():.6f}")

        if is_converged(WINDOW, TOL):
            break

    print(f"[instance:{instance_id}][trial:{trial_id}] best approximate ratio:{max_exp/costs.max()}")

    return [best_para.tolist()], [max_exp/costs.max().tolist()], [len(loss_list)], [loss_list]

#--------------------------------------------------#

for i in range(20):

    os.makedirs(f'{file_name}/{i}', exist_ok=True)

    weights, edges, deg_w, costs, cost_hamiltonian = generate_case(i)
    M = len(edges)

    for trial_id in range(100):
        
        if os.path.exists(f'{file_name}/{i}/best_params.csv'):
            data = pd.read_csv(f'{file_name}/{i}/best_params.csv', header=None)
            data = np.array(data[0])
            if len(data)!=trial_id:
                continue

        params = init_params(
            p=p, N=N, M=M,
            seed=trial_id
        )
        
        best_params_list, best_approx_list, num_epoch_list, loss_list_list = run_case(
            weights, edges, costs, cost_hamiltonian, params, i, trial_id
        )
        
        pd.DataFrame(best_params_list).to_csv(f'{file_name}/{i}/best_params.csv', mode='a', index=False, header=False)
        pd.DataFrame(best_approx_list).to_csv(f'{file_name}/{i}/best_approx.csv', mode='a', index=False, header=False)
        pd.DataFrame(num_epoch_list).to_csv(f'{file_name}/{i}/num_epoch.csv', mode='a', index=False, header=False)
        pd.DataFrame(loss_list_list).to_csv(f'{file_name}/{i}/loss_list.csv', mode='a', index=False, header=False)

    del weights, edges, deg_w, costs, cost_hamiltonian
    gc.collect()
    torch.cuda.empty_cache()

Can someone help me? Thanks!

I think the main problem is this function:

def exoval_by_shots(state: torch.Tensor, shots: int, costs: torch.Tensor) -> torch.Tensor:
    probs = (state.abs() ** 2).to(torch.float64)
    idx = torch.multinomial(probs, num_samples=shots, replacement=True)
    return costs.to(torch.float64)[idx].mean()

Does anyone know how it differs from the behavior of qml.expval with shots? Thanks!

Hi @jeffreylin0909, welcome to the Forum!

Your code is rather complex so I’ll focus on your question regarding qml.expval with shots.

Let’s take the following minimal reproducible example:

import pennylane as qml

dev = qml.device("lightning.qubit", wires=1)

@qml.qnode(dev)
def circuit():
  qml.Hadamard(wires=0)
  return qml.expval(qml.Z(wires=0))

print(circuit())
>>> 0.0

In the example above we have a Hadamard gate, which puts the qubit in the |+\rangle state, and we’re measuring the expectation value of the PauliZ observable on that qubit.
The eigenstates of the PauliZ observable are the states |0\rangle and |1\rangle, with their corresponding eigenvalues being +1 and -1. So if we were to measure this qubit once, the qubit would collapse to one of the two eigenstates of PauliZ, and we would either get a +1 or a -1, depending on which eigenstate the state collapsed to. E.g. if we measured this qubit just once, it might collapse to the state |0\rangle and we would measure a +1 since this is the eigenvalue corresponding to that state.

In the example above we don’t specify the number of shots, so the expectation value is calculated analytically. This is equivalent to having “infinite” shots. We know that the probability of the state collapsing into each eigenstate is 50% in this case, so the expectation value is just 50\% \times (+1) + 50\% \times (-1) = 0. This is the result in the absence of “shot noise”, which is just the effect of statistics (not physical noise).

You can learn more about measurements, including expectation values, in the PennyLane Codebook.

In contrast to the example above, the example below uses only three shots:

dev = qml.device("lightning.qubit", wires=1, shots=3)

@qml.qnode(dev)
def circuit():
  qml.Hadamard(wires=0)
  return qml.expval(qml.Z(wires=0))

print(circuit())
>>> -0.3333333333333333

The exact answer you get will vary every time you run it because we’re no longer using the analytical probabilities to determine the expectation value; instead, we’re averaging the results obtained over the given number of shots.
In the case above, for example, we can infer that out of the three shots, two resulted in a -1 measurement and one resulted in a +1: (2 \times (-1) + 1 \times (+1))/3 = -1/3 \approx -0.33, which is the expectation value we got in this specific case.

I hope this clarifies things. I strongly recommend that you go through the PennyLane Codebook. Start with Introduction to Quantum Computing and work your way through.

I hope this helps!

Thanks for replying!

But what I’m asking is whether qml.expval with shots:

dev = qml.device("lightning.gpu", wires=N, shots=1024)
@qml.qnode(dev, interface='torch', diff_method='parameter-shift')
    def circuit(params):
        ... (implementation of quantum circuit)
        return qml.expval(cost_hamiltonian)

print(circuit(params))

acts the same as the following code I implemented:

dev = qml.device("lightning.gpu", wires=N, shots=None)
@qml.qnode(dev, interface='torch', diff_method='parameter-shift')
    def circuit(params):
        ... (implementation of quantum circuit)
        return qml.state()

def exoval_by_shots(state, shots, costs):
    probs = (state.abs() ** 2).to(torch.float64)
    idx = torch.multinomial(probs, num_samples=shots, replacement=True)
    return costs.to(torch.float64)[idx].mean()

print(exoval_by_shots(circuit(params), 1024, costs))

where my “cost_hamiltonian” is a qml.Hamiltonian object that is diagonal in the Pauli-Z computational basis, and “costs” is a tensor of length 2**N holding the energy of each computational basis state.

However, the latter seems noisier. Do you know what happened? (I know the two are implemented in different ways: the former in C++ and the latter with torch.multinomial, but theoretically they should be identically distributed.)

Thanks !!!

Update:

I think I just figured out what happened. The problem is that qml.state() gives a different ordering of the computational basis than I expected. I expected qml.state() to give the order [00, 01, 10, 11], but it actually gives [00, 10, 01, 11]. After an endianness transform, I get the same distribution as qml.sample() and qml.expval().

Thanks for replying!

Hi @jeffreylin0909,

It’s actually the opposite.

PennyLane uses big-endian ordering (opposite to Qiskit’s little-endian ordering). If you look at the docs for qml.state() you’ll see a good example:

dev = qml.device("default.qubit", wires=2)

@qml.qnode(dev)
def circuit():
    qml.Hadamard(wires=1)
    return qml.state()

circuit()
>>> array([0.70710678+0.j, 0.70710678+0.j, 0.        +0.j, 0.        +0.j])

The returned array shows the probability amplitudes in lexicographic order (|00⟩, |01⟩, |10⟩, |11⟩). Qubit 0 corresponds to the topmost qubit on a circuit diagram, and the leftmost qubit in this list. This is the most significant bit.
Qubit 1 on the other hand, which is the one where we’re applying the Hadamard, is the least significant bit. This means that it’s lower in the circuit diagram, and the rightmost qubit in the list (|00⟩, |01⟩, |10⟩, |11⟩).

Since we’re applying no gates on qubit 0 and a Hadamard on qubit 1 we expect the probability amplitudes to be \frac{1}{\sqrt 2} in both |00⟩ and |01⟩ and 0 in both |10⟩ and |11⟩.
If you look at the output of the circuit this is exactly what we get.

Regarding your code examples, I understand the first one but I really don’t understand the second one. Overall I’d suggest sticking to the code with qml.expval since PennyLane has been optimized to calculate it with and without shots. It will generally be less efficient if you try to calculate it yourself. Of course if you want to do it as a check you can, and you can also look at the implementation in the code. If you want to see it let me know and I can share some links to the relevant files. In the meantime, the code below should work for you (feel free to use lightning.gpu if you have over 20 qubits).

import pennylane as qml
import torch

N = 2
dev = qml.device("lightning.qubit", wires=N, shots=1024)
@qml.qnode(dev)
def circuit(params):
    qml.RX(params,wires=0)
    return qml.expval(qml.Z(wires=0))

params = torch.tensor(2.0)
print(circuit(params))

Some additional notes (a combined sketch follows the list):

  • Are you sure you need diff_method='parameter-shift'? This will make your computation much slower.
  • You don’t need to specify interface='torch'. PennyLane will automatically choose the interface based on your parameter types.
  • Your circuit definitions are indented with respect to the qnode decorator. This may cause problems.
  • You don’t need to specify .to(torch.float64). The output of a qnode is already a Torch tensor as long as its parameters are also Torch tensors.