I am still not able to make it work, and even when it runs, it fails. One such example, with fixed angles in qml.Rot, is the code below, and I have not been able to modify it.
This should be standalone, working code using the moons dataset. Thanks in advance for your time and help!
import pennylane as qml
import pandas as pd
from pennylane import numpy as np
from pennylane.templates.layers import StronglyEntanglingLayers
from pennylane.init import strong_ent_layers_uniform
from pennylane.optimize import GradientDescentOptimizer
from sklearn.datasets import make_moons , make_circles
from sklearn.preprocessing import StandardScaler , minmax_scale
from itertools import chain
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
# --- Hyperparameters ---------------------------------------------------------
blocks = 1            # number of Rot + StronglyEntanglingLayers blocks in the circuit
layers = 2            # layers inside each StronglyEntanglingLayers block
batch_size = 16
n_epochs = 20
test_size = 0.2       # fraction of samples held out by train_test_split
learning_rate = 0.6
entangler = qml.CNOT  # two-qubit gate handed to StronglyEntanglingLayers as `imprimitive`
opt = GradientDescentOptimizer(stepsize=learning_rate)
n_qubits = 2
dev = qml.device("default.qubit", wires=n_qubits)
randomseed = 1

# --- Data --------------------------------------------------------------------
# make_moons shuffles the samples by default.  Without a seed the sample order
# (and therefore which points end up in the train/test split below) changes on
# every run, so seed it with the same `randomseed` used for the weights.
X, y = make_moons(n_samples=400, noise=0, random_state=randomseed)
# Rescale both features to [-pi, pi].
X = minmax_scale(X, feature_range=(-np.pi, np.pi))
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=42
)
from sklearn.metrics import roc_auc_score
# Quantum-circuit parameters.
#
# One randomly drawn StronglyEntanglingLayers weight array per block; block k
# (counting from 1) is seeded with randomseed + k.
theta_weights = [
    strong_ent_layers_uniform(layers, n_qubits, seed=randomseed + k)
    for k in range(1, blocks + 1)
]
theta_bias = 0.0
# Angles for the qml.Rot gates: (chi1, psi1, zet1) act on wire 0 and
# (chi2, psi2, zet2) on wire 1.
chi1, psi1, zet1 = 0.1, 0.2, 0.1
chi2, psi2, zet2 = 0.1, 0.2, 0.1
# Initial parameter tuple handed to the optimizer.
theta_init = (
    theta_weights,
    theta_bias,
    chi1,
    psi1,
    zet1,
    chi2,
    psi2,
    zet2,
)
@qml.qnode(dev)
def circuit(weights):
    """Run the two-qubit variational circuit and measure PauliZ on wire 0.

    Args:
        weights: indexable collection holding one StronglyEntanglingLayers
            weight array per block (``weights[i]`` for ``i`` in
            ``range(blocks)``).

    Returns:
        The expectation value of PauliZ on wire 0.

    NOTE(review): the circuit has no data argument and the Rot angles are read
    from the module-level constants chi1/psi1/zet1/chi2/psi2/zet2, so the
    output depends only on ``weights``.  Encoding a sample or training the
    angles would require changing this signature together with its callers.
    """
    # Hadamard on wire 0, then on wire 1.
    for wire in (0, 1):
        qml.Hadamard(wires=wire)

    for block_idx in range(blocks):
        # Fixed single-qubit rotations: wire 0 first, then wire 1.
        per_wire_angles = ((0, (chi1, psi1, zet1)), (1, (chi2, psi2, zet2)))
        for wire, angles in per_wire_angles:
            qml.Rot(*angles, wires=wire)
        StronglyEntanglingLayers(
            weights[block_idx],
            wires=range(n_qubits),
            imprimitive=entangler,
        )

    return qml.expval(qml.PauliZ(0))
# variational quantum classifier
def variational_classifier(theta):
    """Return the classifier output for the parameter tuple ``theta``.

    The output is the circuit expectation value plus the bias plus the sum of
    the six module-level Rot angles.

    Args:
        theta: parameter tuple; ``theta[0]`` holds the circuit weights and
            ``theta[1]`` the scalar bias.  Further entries are not read here.

    Returns:
        ``circuit(weights) + bias + chi1 + psi1 + zet1 + chi2 + psi2 + zet2``.

    NOTE(review): the angles are taken from the module-level names, not from
    ``theta``, so the optimizer does not update them.
    """
    weights, bias = theta[0], theta[1]
    # The fourth angle term was `chi1` a second time; it should be `chi2`
    # so that each of the six Rot angles contributes exactly once.
    return circuit(weights) + bias + chi1 + psi1 + zet1 + chi2 + psi2 + zet2
# Training starts from the initial parameter tuple `theta_init`; `theta` is
# rebound to the optimizer's output on every step of the training loop.
theta = theta_init
def cost(theta, X, expectations):
    """Mean squared error between classifier outputs and target values.

    Args:
        theta: parameter tuple forwarded to ``variational_classifier``.
        X: batch of samples; the classifier is evaluated once per entry.
        expectations: target values compared with the classifier outputs.

    Returns:
        The mean of the squared differences.

    NOTE(review): ``variational_classifier`` accepts only ``theta``, so the
    entries of ``X`` determine how many evaluations are made but not their
    values.
    """
    outputs = []
    for _ in X:
        outputs.append(variational_classifier(theta))
    residuals = np.array(outputs) - expectations
    return np.mean(residuals ** 2)
def accuracy(labels, predictions):
    """Return the fraction of predictions that match their label.

    A prediction counts as a match when it differs from its label by less
    than 1e-5 in absolute value.  Pairs are formed with ``zip``, so surplus
    entries of the longer sequence are ignored, while the count is always
    divided by ``len(labels)``.

    Args:
        labels: sequence of true values.
        predictions: sequence of predicted values.

    Returns:
        Number of matches divided by ``len(labels)``.
    """
    tolerance = 1e-5
    n_matches = sum(
        1
        for label, prediction in zip(labels, predictions)
        if abs(label - prediction) < tolerance
    )
    return n_matches / len(labels)
# Number of mini-batches per epoch (at least one, so array_split never
# receives zero sections when the training set is smaller than batch_size).
batches = max(1, len(X_train) // batch_size)
# Split the training-sample indices into batches.
X_batches = np.array_split(np.arange(len(X_train)), batches)

# make_moons labels are {0, 1}, whereas predictions below are thresholded to
# {-1, +1} at expectation value 0.  Train and score against {-1, +1} labels so
# that targets, threshold and metrics all use the same convention; comparing
# {-1, +1} predictions with {0, 1} labels can never count class 0 as correct.
y_train_signed = 2 * y_train - 1
y_test_signed = 2 * y_test - 1


def _predict_signed_labels(theta, samples):
    """Return one hard {-1, +1} prediction per entry of ``samples``."""
    # NOTE(review): variational_classifier takes only theta, so every sample
    # receives the same output value.
    expectations = np.array([variational_classifier(theta) for _ in samples])
    prob_class_one = (expectations + 1) / 2.0
    return [-1 if p <= 0.5 else 1 for p in prob_class_one]


def _plot_decision_grid(theta):
    """Plot the classifier output on a 15x15 grid over [-pi, pi] x [-pi, pi]."""
    plt.figure(figsize=(5, 5))
    xx, yy = np.meshgrid(
        np.linspace(-np.pi, np.pi, 15), np.linspace(-np.pi, np.pi, 15)
    )
    X_grid = [np.array([gx, gy]) for gx, gy in zip(xx.flatten(), yy.flatten())]
    predictions_grid = np.array([variational_classifier(theta) for _ in X_grid])
    Z = np.reshape(predictions_grid, xx.shape)
    # Filled decision regions plus the zero-level decision boundary.
    plt.contourf(
        xx, yy, Z,
        levels=np.arange(-1, 1., 0.1), cmap=plt.cm.RdBu, alpha=0.8, extend="both",
    )
    plt.contour(
        xx, yy, Z,
        levels=[0.0], colors=("black",), linestyles=("--",), linewidths=(0.8,),
    )
    plt.show()


lossplot = []
aucrocplot = []
accuracytrainplot = []
accuracytestplot = []
total_steps = len(X_batches) * n_epochs
for it, batch_index in enumerate(chain.from_iterable([X_batches] * n_epochs)):
    # Update the parameters by one optimizer step on the current batch.
    X_batch = X_train[batch_index]
    y_batch = y_train_signed[batch_index]
    theta = opt.step(lambda t: cost(t, X_batch, y_batch), theta)

    # Report, plot and check the stopping criterion only every 10th step.
    if it % 10 != 0:
        continue

    train_predictions = _predict_signed_labels(theta, X_train)
    test_predictions = _predict_signed_labels(theta, X_test)
    _plot_decision_grid(theta)

    # Compute each metric once and reuse it for the history, the log line and
    # the early-stopping test.
    batch_loss = cost(theta, X_batch, y_batch)
    acc_train = metrics.accuracy_score(y_train_signed, train_predictions)
    acc_test = metrics.accuracy_score(y_test_signed, test_predictions)
    auc = roc_auc_score(y_test_signed, test_predictions)

    lossplot.append(batch_loss)
    aucrocplot.append(auc)
    accuracytrainplot.append(acc_train)
    accuracytestplot.append(acc_test)

    print("It", it + 1, "out of", total_steps, "loss: ", batch_loss,
          " : Acc train: ", round(acc_train, 2),
          " : Acc test : ", round(acc_test, 3),
          " : Auc : ", round(auc, 3)
          )
    if acc_train >= 0.97:
        break

# Training-loss history.
plt.plot(lossplot)
plt.ylabel('Loss')
plt.show()
# Train/test accuracy history.
plt.subplot(2, 1, 1)
plt.plot(accuracytrainplot, 'r', label="train")
plt.plot(accuracytestplot, 'b', label="test")
plt.ylabel('accuracy')
plt.legend()
plt.show()
# ROC-AUC history on the test set.
plt.plot(aucrocplot)
plt.ylabel('auc roc score')
plt.show()