Hello! I trained my model on an RTX A6000 and ran into an issue: GPU utilization sits at only ~18% and GPU memory usage at ~27%. The model is an NLP Quantum-LSTM. Could you check why it trains so slowly on an RTX A6000? My server has 224 GB of RAM, and I would like the training to run faster.
My code:
# --- Model and Training Configuration ---
N_QUBITS_LSTM = 2    # Qubits for the QLSTM cells
N_QUBITS_ATTN = 2    # Qubits for the Attention gate
N_QLAYERS = 1        # Variational layers per quantum circuit
EMBEDDING_DIM = 50   # Must match the GloVe vectors loaded below (glove-*-50)
HIDDEN_SIZE = 128
MAX_SEQ_LEN = 512
BATCH_SIZE = 1024
EPOCHS = 10
LEARNING_RATE = 0.01
DROPOUT_RATE = 0.5
NUM_WORKERS = 0      # 0 = batches are loaded in the main process
SAMPLE = 10000       # Number of dataset rows to read

print("Loading GloVe word vectors...")
word2vec_model = api.load('glove-wiki-gigaword-50')
print("GloVe model loaded.")

# Fall back to CPU instead of crashing (and falsely printing "GPU found")
# when CUDA is not available on this machine.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    print(f"✅ GPU found: {torch.cuda.get_device_name(0)}. Running on CUDA.")
else:
    DEVICE = torch.device("cpu")
    print("⚠️ No CUDA device found. Running on CPU.")
def clean_text(text):
def text_to_vectors(texts, max_len=MAX_SEQ_LEN):
def get_data_loaders(dataset_path, sample_size=SAMPLE):
    """Load the CSV dataset, preprocess it, and return PyTorch DataLoaders.

    Returns:
        (train_loader, val_loader, test_loader), or (None, None, None) when
        the dataset file is missing — the caller guards on
        ``if train_loader is None``, so this function must actually be able
        to return None (the original never could).
    """
    try:
        df = pd.read_csv(dataset_path, nrows=sample_size)
    except FileNotFoundError:
        print(f"Dataset not found at: {dataset_path}")
        return None, None, None
    df['review'] = df['review'].apply(clean_text)
    # Binary sentiment labels: 'positive' -> 1, anything else -> 0.
    labels = [1 if s == 'positive' else 0 for s in df['sentiment']]
    reviews_vec = text_to_vectors(df['review'].tolist())
    # NOTE(review): float64 doubles GPU memory/bandwidth vs float32 and is a
    # likely contributor to slow training; kept as-is because the model is
    # also cast to float64 downstream — switch both together to float32.
    X = torch.tensor(reviews_vec, dtype=torch.float64)
    y = torch.tensor(labels, dtype=torch.float64)
    # Stratified 70 / 15 / 15 split (train / val / test).
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y)
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)
    train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=BATCH_SIZE,
                              shuffle=True, pin_memory=True, num_workers=NUM_WORKERS)
    val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=BATCH_SIZE,
                            pin_memory=True, num_workers=NUM_WORKERS)
    test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=BATCH_SIZE,
                             pin_memory=True, num_workers=NUM_WORKERS)
    print(f"Data split: Train={len(X_train)}, Validation={len(X_val)}, Test={len(X_test)}")
    return train_loader, val_loader, test_loader
# --- Quantum Circuit Definitions ---
import math
# NOTE(review): these two module-level devices appear unused — create_vqc()
# below constructs its own lightning.gpu device per QNode. Confirm nothing
# else in the file uses them before removing.
dev_lstm = qml.device("lightning.gpu", wires=N_QUBITS_LSTM)
dev_attn = qml.device("lightning.gpu", wires=N_QUBITS_ATTN)
# Re-usable VQC function
def create_vqc(n_qubits, n_qlayers):
    """Build a torch-interfaced QNode for an n-qubit variational circuit.

    Circuit: Hadamard + RZ(input) angle encoding on every wire, then
    n_qlayers of (per-wire RY(weight) rotations followed by a linear CNOT
    chain), measured as one Pauli-Z expectation value per wire.
    """
    # A fresh lightning.gpu device is created for every QNode; the
    # module-level dev_lstm/dev_attn devices are not reused here.
    @qml.qnode(qml.device("lightning.gpu", wires=n_qubits), interface='torch', diff_method='adjoint')
    def vqc(inputs, weights):
        # Encoding layer. inputs[:, i] implies a batched (batch, n_qubits)
        # input tensor — TODO confirm broadcasting is supported with this
        # device/diff_method combination.
        for i in range(n_qubits):
            qml.Hadamard(wires=i)
            qml.RZ(inputs[:, i], wires=i)
        # Variational ansatz: trainable RY per wire, then entangling CNOTs.
        for layer_idx in range(n_qlayers):
            for i in range(n_qubits):
                qml.RY(weights[layer_idx, i], wires=i)
            for i in range(n_qubits - 1):
                qml.CNOT(wires=[i, i + 1])
        # One <Z> expectation per qubit.
        return [qml.expval(qml.PauliZ(i)) for i in range(n_qubits)]
    return vqc
class QuantumModel():
# --- QLSTM Cell ---
class QLSTM(nn.Module):
    """Quantum LSTM cell.

    Four variational quantum circuits produce the pre-activations of the
    classical LSTM forget / input / update / output gates; small linear
    layers expand each circuit's N_QUBITS_LSTM expectation values back to
    hidden_size.
    """

    def __init__(self, input_size, hidden_size):
        # BUG FIX: the original called super(QLSTM_Cell, self).__init__(),
        # but no class named QLSTM_Cell exists -> NameError at instantiation.
        super().__init__()
        self.hidden_size = hidden_size
        # Input to the gate projection: real + imaginary embedding halves
        # plus the previous hidden state.
        self.concat_size = (input_size * 2) + hidden_size
        # Projects combined features down to 4 gates x N_QUBITS_LSTM angles.
        self.classifier_input = nn.Linear(self.concat_size, N_QUBITS_LSTM * 4)
        weight_shapes = {"weights": (N_QLAYERS, N_QUBITS_LSTM)}
        self.vqc_forget = qml.qnn.TorchLayer(create_vqc(N_QUBITS_LSTM, N_QLAYERS), weight_shapes)
        self.vqc_input = qml.qnn.TorchLayer(create_vqc(N_QUBITS_LSTM, N_QLAYERS), weight_shapes)
        self.vqc_update = qml.qnn.TorchLayer(create_vqc(N_QUBITS_LSTM, N_QLAYERS), weight_shapes)
        self.vqc_output = qml.qnn.TorchLayer(create_vqc(N_QUBITS_LSTM, N_QLAYERS), weight_shapes)
        # Map each circuit's N_QUBITS_LSTM outputs to hidden_size.
        self.fc_forget = nn.Linear(N_QUBITS_LSTM, hidden_size)
        self.fc_input = nn.Linear(N_QUBITS_LSTM, hidden_size)
        self.fc_update = nn.Linear(N_QUBITS_LSTM, hidden_size)
        self.fc_output = nn.Linear(N_QUBITS_LSTM, hidden_size)

    def forward(self, x_complex, states):
        """One LSTM step.

        Args:
            x_complex: (x_real, x_imag) pair of (batch, input_size) tensors.
            states: (h_t, c_t) previous hidden and cell states.

        Returns:
            (h_t_next, c_t_next) updated hidden and cell states.
        """
        h_t, c_t = states
        x_real, x_imag = x_complex
        combined = torch.cat((x_real, x_imag, h_t), dim=1)
        # Shape (batch, 4, N_QUBITS_LSTM): one row of angles per gate circuit.
        vqc_inputs = self.classifier_input(combined).view(-1, 4, N_QUBITS_LSTM)
        gates = [self.vqc_forget, self.vqc_input, self.vqc_update, self.vqc_output]
        f_q, i_q, c_tilde_q, o_q = [vqc(vqc_inputs[:, i, :]) for i, vqc in enumerate(gates)]
        f_t = torch.sigmoid(self.fc_forget(f_q))
        i_t = torch.sigmoid(self.fc_input(i_q))
        c_tilde = torch.tanh(self.fc_update(c_tilde_q))
        o_t = torch.sigmoid(self.fc_output(o_q))
        # Standard LSTM state update equations.
        c_t_next = f_t * c_t + i_t * c_tilde
        h_t_next = o_t * torch.tanh(c_t_next)
        return h_t_next, c_t_next
class Sentiment_QLSTM(nn.Module):
from torch.cuda.amp import GradScaler, autocast
def train_and_evaluate():
    """Train the sentiment QLSTM with AMP and evaluate on val/test splits."""
    # 1. Load Data
    train_loader, val_loader, test_loader = get_data_loaders(IMDB_DATASET_PATH)
    if train_loader is None:
        return

    # 2. Initialize Model and Optimizer
    # NOTE(review): the visible class at module level is named
    # Sentiment_QLSTM, not SentimentQSA_QLSTM — confirm which name is
    # actually defined, otherwise this line raises NameError.
    model = SentimentQSA_QLSTM(
        embedding_dim=EMBEDDING_DIM,
        hidden_size=HIDDEN_SIZE,
    )
    # NOTE(review): float64 weights conflict with AMP autocast (which targets
    # float16/float32) and are ~2x slower on GPU; consider float32 end-to-end.
    model = model.to(dtype=torch.float64).to(DEVICE)
    # torch.compile() removed.
    # model = torch.compile(model)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.BCEWithLogitsLoss()
    scaler = GradScaler()  # GradScaler for AMP

    # 3. Training Loop
    print("\n--- Starting QSA-QLSTM Model Training with AMP ---")
    history = {'train_loss': [], 'val_loss': [], 'val_acc': [], 'val_f1': []}
    for epoch in range(EPOCHS):
        start_time = time.time()
        model.train()
        epoch_loss = 0
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", leave=False)
        for data, target in pbar:
            data = data.to(DEVICE)
            # BUG FIX: the original did `target = data.to(...)`, overwriting
            # the labels with the input features, so the loss compared the
            # model output against the inputs instead of the labels.
            target = target.to(dtype=torch.float64).to(DEVICE)
            optimizer.zero_grad(set_to_none=True)  # cheaper gradient clearing
            with autocast():
                output = model(data).squeeze(1)
                # Ensure output/target shapes line up for BCEWithLogitsLoss.
                loss = criterion(output, target)
            # Scale the loss and run the backward pass.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            epoch_loss += loss.item()
            pbar.set_postfix({'loss': loss.item()})
        avg_train_loss = epoch_loss / len(train_loader)
        history['train_loss'].append(avg_train_loss)

        # Validation (AMP not needed here)
        val_loss, val_acc, val_f1, _, _ = evaluate_model(model, val_loader, criterion)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)
        history['val_f1'].append(val_f1)
        epoch_duration = time.time() - start_time
        print(f"Epoch {epoch+1:02d}/{EPOCHS} | Time: {epoch_duration:.2f}s | "
              f"Train Loss: {avg_train_loss:.4f} | Val Loss: {val_loss:.4f} | "
              f"Val Acc: {val_acc:.4f} | Val F1: {val_f1:.4f}")

    # 4. Final Testing
    print("\n--- Testing on Final Test Set ---")
    # ... (rest of the evaluation code is identical and can be copied from the previous response)
# Run the entire process only when executed as a script, so importing this
# module for its classes/functions does not kick off a full training run.
if __name__ == "__main__":
    train_and_evaluate()
Training takes a very long time — what can I change to make it train faster?