Updating LSTMDQN.py to incorporate Optuna tuning into the DQN model training, and possibly rerunning if the DQN underperforms

This commit is contained in:
2025-01-29 18:32:56 +00:00
parent 2ca8890e5f
commit 78ef2965b8
3 changed files with 440 additions and 1046 deletions

.gitignore

@@ -1 +1,2 @@
venv/
.python-version

LSTMDQN.py

@@ -3,42 +3,44 @@ import sys
import argparse
import numpy as np
import pandas as pd
import logging
from tabulate import tabulate
import matplotlib.pyplot as plt
import seaborn as sns
# TensorFlow / Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam, Nadam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.losses import Huber
from tensorflow.keras.regularizers import l2
import xgboost as xgb
# Sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
# Optuna
import optuna
from optuna.integration import KerasPruningCallback
# RL stuff
import gym
from gym import spaces
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback
# Suppress TensorFlow logs beyond errors
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
######################################################
# 1. DATA LOADING & ADVANCED TECHNICAL INDICATORS
######################################################
def load_data(file_path):
logging.info(f"Loading data from: {file_path}")
try:
@@ -68,7 +70,6 @@ def load_data(file_path):
logging.info("Data loaded and sorted successfully.")
return df
def compute_rsi(series, window=14):
delta = series.diff()
gain = delta.where(delta>0, 0).rolling(window=window).mean()
@@ -76,7 +77,6 @@ def compute_rsi(series, window=14):
RS = gain / (loss+1e-9)
return 100 - (100/(1+RS))
def compute_macd(series, span_short=12, span_long=26, span_signal=9):
ema_short = series.ewm(span=span_short, adjust=False).mean()
ema_long = series.ewm(span=span_long, adjust=False).mean()
@@ -84,14 +84,11 @@ def compute_macd(series, span_short=12, span_long=26, span_signal=9):
signal_line = macd_line.ewm(span=span_signal, adjust=False).mean()
return macd_line - signal_line # histogram
def compute_obv(df):
signed_volume = (np.sign(df['Close'].diff()) * df['Volume']).fillna(0)
return signed_volume.cumsum()
def compute_adx(df, window=14):
"""Pseudo-ADX approach using rolling True Range / Close."""
df['H-L'] = df['High'] - df['Low']
df['H-Cp'] = (df['High'] - df['Close'].shift(1)).abs()
df['L-Cp'] = (df['Low'] - df['Close'].shift(1)).abs()
@@ -101,7 +98,6 @@ def compute_adx(df, window=14):
df.drop(['H-L','H-Cp','L-Cp'], axis=1, inplace=True)
return adx_placeholder
def compute_bollinger_bands(series, window=20, num_std=2):
sma = series.rolling(window=window).mean()
std = series.rolling(window=window).std()
@@ -110,7 +106,6 @@ def compute_bollinger_bands(series, window=20, num_std=2):
bandwidth = (upper - lower)/(sma + 1e-9)
return upper, lower, bandwidth
def compute_mfi(df, window=14):
typical_price = (df['High']+ df['Low']+ df['Close'])/3
money_flow = typical_price* df['Volume']
@@ -122,22 +117,19 @@ def compute_mfi(df, window=14):
mfi= 100-(100/(1+ pos_sum/(neg_sum+1e-9)))
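# (MFI is a volume-weighted RSI analogue: 100 - 100/(1 + positive_flow/negative_flow))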
return mfi
def calculate_technical_indicators(df):
logging.info("Calculating technical indicators...")
df['RSI'] = compute_rsi(df['Close'], 14)
df['MACD']= compute_macd(df['Close'])
df['OBV'] = compute_obv(df)
df['ADX'] = compute_adx(df)
up, lo, bw = compute_bollinger_bands(df['Close'], 20, 2)
df['BB_Upper']= up
df['BB_Lower']= lo
df['BB_Width']= bw
df['MFI'] = compute_mfi(df,14)
df['SMA_5'] = df['Close'].rolling(5).mean()
df['SMA_10']= df['Close'].rolling(10).mean()
df['EMA_5'] = df['Close'].ewm(span=5, adjust=False).mean()
@@ -150,67 +142,123 @@ def calculate_technical_indicators(df):
###############################
# 2. ARG PARSING
###############################
def parse_arguments():
parser = argparse.ArgumentParser(description='All-in-One: LSTM + DQN (with LSTM predictions) + Tuning.')
parser.add_argument('csv_path', type=str,
help='Path to CSV data with columns [time, open, high, low, close, volume].')
parser.add_argument('--lstm_window_size', type=int, default=15,
help='Sequence window size for LSTM. Default=15.')
parser.add_argument('--dqn_total_timesteps', type=int, default=50000,
help='Total timesteps to train each DQN candidate. Default=50000.')
parser.add_argument('--dqn_eval_episodes', type=int, default=1,
help='Number of episodes to evaluate DQN in the tuning step. Default=1 (entire dataset once).')
parser.add_argument('--n_trials_lstm', type=int, default=30,
help='Number of Optuna trials for LSTM. Default=30.')
parser.add_argument('--n_trials_dqn', type=int, default=20,
help='Number of Optuna trials for DQN. Default=20.')
return parser.parse_args()
###############################
# 3. CUSTOM DQN CALLBACK: LOG ACTIONS + REWARDS
###############################
class ActionLoggingCallback(BaseCallback):
"""
Logs distribution of actions and average reward after each rollout.
For off-policy algorithms like DQN, a "rollout" differs from the on-policy notion,
but stable-baselines3 still calls `_on_rollout_end` periodically.
"""
def __init__(self, verbose=0):
super().__init__(verbose)
self.action_buffer = []
self.reward_buffer = []
def _on_training_start(self):
self.action_buffer = []
self.reward_buffer = []
def _on_step(self):
# Depending on the SB3 version, the locals dict exposes 'actions'/'rewards'
# (plural, as arrays) rather than 'action'/'reward'; check both spellings.
action = self.locals.get('actions', self.locals.get('action', None))
reward = self.locals.get('rewards', self.locals.get('reward', None))
if action is not None:
self.action_buffer.append(np.ravel(action)[0])
if reward is not None:
self.reward_buffer.append(np.ravel(reward)[0])
return True
def _on_rollout_end(self):
actions = np.array(self.action_buffer)
rewards = np.array(self.reward_buffer)
if len(actions)>0:
unique, counts = np.unique(actions, return_counts=True)
total = len(actions)
distr_str = []
for act,c in zip(unique, counts):
distr_str.append(f"Action {act}: {c} times ({100*c/total:.2f}%)")
logging.info(" -- DQN Rollout End -- ")
logging.info(" " + ", ".join(distr_str))
logging.info(f" Avg Reward this rollout: {rewards.mean():.4f} (min={rewards.min():.4f}, max={rewards.max():.4f})")
self.action_buffer = []
self.reward_buffer = []
###############################
# 4. MAIN
###############################
def main():
# Parse command-line arguments
args = parse_arguments()
csv_path = args.csv_path
lstm_window_size = args.lstm_window_size
dqn_total_timesteps = args.dqn_total_timesteps
dqn_eval_episodes = args.dqn_eval_episodes
n_trials_lstm = args.n_trials_lstm
n_trials_dqn = args.n_trials_dqn
##########################################
# A) LSTM PART: LOAD, PREPROCESS, TUNE
##########################################
# 1) LOAD & preprocess
df = load_data(csv_path)
df = calculate_technical_indicators(df)
# We'll exclude 'Close' from the feature set
feature_columns = [
'SMA_5','SMA_10','EMA_5','EMA_10','STDDEV_5',
'RSI','MACD','ADX','OBV','Volume','Open','High','Low',
'BB_Upper','BB_Lower','BB_Width','MFI'
]
target_column = 'Close'
df = df[['Date']+ feature_columns+[target_column]].dropna()
# 2) Scaling
scaler_features = MinMaxScaler()
scaler_target = MinMaxScaler()
X_all = df[feature_columns].values
y_all = df[[target_column]].values
X_scaled = scaler_features.fit_transform(X_all)
y_scaled = scaler_target.fit_transform(y_all).flatten()
# 2) Create sequences
def create_sequences(features, target, window_size):
X_seq, y_seq = [], []
for i in range(len(features) - window_size):
X_seq.append(features[i:i+window_size])
y_seq.append(target[i+window_size])
return np.array(X_seq), np.array(y_seq)
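# e.g. window_size=3 over 5 rows yields 2 samples: rows [0:3] -> target[3], rows [1:4] -> target[4]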
X, y = create_sequences(X_scaled, y_scaled, lstm_window_size)
# 3) Split into train/val/test
train_size = int(len(X)*0.7)
val_size = int(len(X)*0.15)
test_size = len(X)- train_size- val_size
X_train, y_train = X[:train_size], y[:train_size]
X_val, y_val = X[train_size: train_size+ val_size], y[train_size: train_size+ val_size]
X_test, y_test = X[train_size+ val_size:], y[train_size+ val_size:]
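# Chronological 70/15/15 split (no shuffling), so validation and test data lie strictly after the training data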
logging.info(f"Scaled training features shape: {X_train.shape}")
logging.info(f"Scaled validation features shape: {X_val.shape}")
@@ -219,14 +267,14 @@ def main():
logging.info(f"Scaled validation target shape: {y_val.shape}")
logging.info(f"Scaled testing target shape: {y_test.shape}")
# 4) GPU config
def configure_device():
gpus = tf.config.list_physical_devices('GPU')
if gpus:
try:
for gpu in gpus:
tf.config.experimental.set_memory_growth(gpu, True)
logging.info(f"{len(gpus)} GPU(s) detected and configured.")
logging.info(f"{len(gpus)} GPU(s) detected & configured.")
except RuntimeError as e:
logging.error(e)
else:
@@ -234,33 +282,38 @@ def main():
configure_device()
# 5) Build LSTM function
def build_lstm(input_shape, hyperparams):
model = Sequential()
num_layers = hyperparams['num_lstm_layers']
units = hyperparams['lstm_units']
drop = hyperparams['dropout_rate']
for i in range(num_layers):
return_seqs = (i< num_layers-1)
model.add(Bidirectional(
LSTM(units, return_sequences=return_seqs, kernel_regularizer=l2(1e-4)),
input_shape=input_shape if i==0 else None
))
model.add(Dropout(drop))
model.add(Dense(1, activation='linear'))
opt_name= hyperparams['optimizer']
lr = hyperparams['learning_rate']
decay = hyperparams['decay']
if opt_name=='Adam':
opt= Adam(learning_rate=lr, decay=decay)
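# Assumption: the `decay` kwarg is the legacy Keras argument; newer TF/Keras
# releases drop it in favor of a LearningRateSchedule.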
elif opt_name=='Nadam':
opt= Nadam(learning_rate=lr)
else:
opt= Adam(learning_rate=lr)
model.compile(loss=Huber(), optimizer=opt, metrics=['mae'])
return model
# 6) Optuna objective for LSTM
def lstm_objective(trial):
num_lstm_layers = trial.suggest_int('num_lstm_layers',1,3)
lstm_units = trial.suggest_categorical('lstm_units',[32,64,96,128])
dropout_rate = trial.suggest_float('dropout_rate',0.1,0.5)
@@ -277,7 +330,7 @@ def main():
'decay': decay
}
model_ = build_lstm((X_train.shape[1], X_train.shape[2]), hyperparams)
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
lr_reduce = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)
cb_prune = KerasPruningCallback(trial, 'val_loss')
@@ -293,21 +346,19 @@ def main():
val_mae = min(history.history['val_mae'])
return val_mae
logging.info("Starting hyperparameter optimization with Optuna...")
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
best_params = study.best_params
logging.info(f"Best Hyperparameters from Optuna: {best_params}")
# 8) Train the Best LSTM
best_model = build_advanced_lstm((X_train.shape[1], X_train.shape[2]), best_params)
logging.info("Starting LSTM hyperparam optimization with Optuna...")
study_lstm= optuna.create_study(direction='minimize')
study_lstm.optimize(lstm_objective, n_trials=n_trials_lstm)
best_lstm_params = study_lstm.best_params
logging.info(f"Best LSTM Hyperparams: {best_lstm_params}")
# 7) Train final LSTM
final_lstm = build_lstm((X_train.shape[1], X_train.shape[2]), best_lstm_params)
early_stop_final= EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
lr_reduce_final= ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)
logging.info("Training the best LSTM model with optimized hyperparameters...")
history = best_model.fit(
logging.info("Training best LSTM model with found hyperparams...")
hist= final_lstm.fit(
X_train,y_train,
epochs=300,
batch_size=16,
@@ -316,93 +367,98 @@ def main():
verbose=1
)
# Evaluate LSTM
def evaluate_lstm(model, X_test, y_test):
logging.info("Evaluating final LSTM...")
y_pred_scaled= model.predict(X_test).flatten()
y_pred_scaled= np.clip(y_pred_scaled,0,1)
y_pred = scaler_target.inverse_transform(y_pred_scaled.reshape(-1,1)).flatten()
y_test_actual= scaler_target.inverse_transform(y_test.reshape(-1,1)).flatten()
mse_= mean_squared_error(y_test_actual,y_pred)
rmse_= np.sqrt(mse_)
mae_ = mean_absolute_error(y_test_actual,y_pred)
r2_ = r2_score(y_test_actual,y_pred)
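# Directional accuracy: fraction of steps where the predicted and actual day-over-day moves share the same sign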
direction_actual= np.sign(np.diff(y_test_actual))
direction_pred = np.sign(np.diff(y_pred))
directional_accuracy= np.mean(direction_actual== direction_pred)
logging.info(f"Test MSE: {mse}")
logging.info(f"Test RMSE: {rmse}")
logging.info(f"Test MAE: {mae}")
logging.info(f"Test R2 Score: {r2}")
logging.info(f"Test MSE: {mse_}")
logging.info(f"Test RMSE: {rmse_}")
logging.info(f"Test MAE: {mae_}")
logging.info(f"Test R2 Score: {r2_}")
logging.info(f"Directional Accuracy: {directional_accuracy}")
# Plot
plt.figure(figsize=(14,7))
plt.plot(y_test_actual, label='Actual Price')
plt.plot(y_pred, label='Predicted Price')
plt.xlabel('Time Step')
plt.ylabel('Price')
plt.title('LSTM: Actual vs Predicted')
plt.legend()
plt.grid(True)
plt.savefig('lstm_actual_vs_pred.png')
plt.close()
logging.info("Plot saved as 'actual_vs_predicted.png'")
# Tabulate first 40
table=[]
limit= min(40,len(y_test_actual))
for i in range(limit):
table.append([i, round(y_test_actual[i],2), round(y_pred[i],2)])
headers= ["Index","Actual Price","Predicted Price"]
print(tabulate(table, headers=headers, tablefmt="pretty"))
return r2_, directional_accuracy
_r2, _diracc= evaluate_lstm(final_lstm, X_test, y_test)
# Save LSTM + scalers
final_lstm.save('best_lstm_model.h5')
joblib.dump(scaler_features,'scaler_features.pkl')
joblib.dump(scaler_target, 'scaler_target.pkl')
logging.info("Saved best LSTM model + scalers (best_lstm_model.h5, scaler_features.pkl, scaler_target.pkl).")
############################################################
# B) DQN PART: BUILD ENV THAT USES THE LSTM + FORECAST
############################################################
class StockTradingEnvWithLSTM(gym.Env):
"""
An environment that uses the LSTM model's predicted next day close
as part of the observation:
obs = [technical indicators, balance, shares, cost_basis, predicted_next_close].
Reward => net_worth - initial_balance each step.
"""
metadata = {'render.modes':['human']}
def __init__(self, df, feature_columns, lstm_model, scaler_features, scaler_target,
window_size=15, initial_balance=10000, transaction_cost=0.001):
super().__init__()
self.df= df.reset_index(drop=True)
self.feature_columns= feature_columns
self.lstm_model= lstm_model
self.scaler_features= scaler_features
self.scaler_target= scaler_target
self.window_size= window_size
self.initial_balance= initial_balance
self.balance= initial_balance
self.net_worth= initial_balance
self.transaction_cost= transaction_cost
self.max_steps= len(df)
self.current_step=0
self.shares_held=0
self.cost_basis=0
# raw array of features
self.raw_features= df[feature_columns].values
# 0=Sell,1=Hold,2=Buy
self.action_space= spaces.Discrete(3)
# observation dimension = len(feature_columns)+3 +1 => 17 + 3 +1=21
self.observation_space= spaces.Box(
low=0, high=1,
shape=(len(feature_columns)+3+1,),
dtype=np.float32
)
@@ -412,26 +468,42 @@ def main():
self.current_step=0
self.shares_held=0
self.cost_basis=0
return self._get_obs()
def _get_obs(self):
row= self.raw_features[self.current_step]
row_max= np.max(row) if np.max(row)!=0 else 1.0
row_norm= row/row_max
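# Caveat: each row is normalized by its own max, so feature scales vary step to step;
# reusing the already-fitted scaler_features here would be a more consistent alternative.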
# account info
additional= np.array([
self.balance/self.initial_balance,
self.shares_held/100.0,
self.cost_basis/(self.initial_balance+1e-9)
], dtype=np.float32)
# LSTM prediction
if self.current_step< self.window_size:
# not enough history => no forecast
predicted_close= 0.0
else:
seq= self.raw_features[self.current_step - self.window_size: self.current_step]
seq_scaled= self.scaler_features.transform(seq)
seq_scaled= np.expand_dims(seq_scaled, axis=0) # shape (1, window_size, #features)
pred_scaled= self.lstm_model.predict(seq_scaled, verbose=0).flatten()[0]
pred_scaled= np.clip(pred_scaled,0,1)
unscaled= self.scaler_target.inverse_transform([[pred_scaled]])[0,0]
# keep raw or scale; we naively divide by 1000 so the forecast stays below 1 for prices under 1000
predicted_close= unscaled/1000.0
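# Performance note: lstm_model.predict() runs once per env step; for long datasets,
# precomputing all window forecasts up front would be much faster.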
obs= np.concatenate([row_norm, additional, [predicted_close]]).astype(np.float32)
return obs
def step(self, action):
prev_net_worth= self.net_worth
current_price= self.df.loc[self.current_step,'Close']
if action==2: # BUY
shares_bought= int(self.balance// current_price)
if shares_bought>0:
cost= shares_bought* current_price
@@ -439,12 +511,11 @@ def main():
self.balance-= (cost+ fee)
old_shares= self.shares_held
self.shares_held+= shares_bought
# Weighted average cost
self.cost_basis=(
(self.cost_basis* old_shares)+ (shares_bought* current_price)
)/ self.shares_held
elif action==0: # SELL
if self.shares_held>0:
revenue= self.shares_held* current_price
fee= revenue* self.transaction_cost
@@ -452,95 +523,169 @@ def main():
self.shares_held=0
self.cost_basis=0
self.net_worth= self.balance+ self.shares_held* current_price
self.current_step+=1
done= (self.current_step>= self.max_steps -1)
# Reward: net_worth - initial_balance (like original code)
reward= self.net_worth- self.initial_balance
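# (prev_net_worth above would support a per-step alternative, reward = net_worth - prev_net_worth;
# the cumulative form used here grows with episode length.)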
obs= self._get_obs()
return obs, reward, done, {}
def render(self, mode='human'):
profit= self.net_worth- self.initial_balance
print(f"Step: {self.current_step}, "
f"Balance: {self.balance:.2f}, "
f"Shares: {self.shares_held}, "
f"NetWorth: {self.net_worth:.2f}, "
f"Profit: {profit:.2f}")
f"Balance={self.balance:.2f}, "
f"Shares={self.shares_held}, "
f"NetWorth={self.net_worth:.2f}, "
f"Profit={profit:.2f}")
###################################
# C) DQN HYPERPARAM TUNING W/ LSTM
###################################
# We'll define a function that trains a DQN with trial hyperparams,
# then evaluates final net worth on one run.
# We'll define a small function to do final net worth check:
def evaluate_dqn_networth(model, env, n_episodes=1):
# Runs the entire dataset once per episode and records the final net worth.
# For multiple episodes, you could reset from random start positions instead.
final_net_worths = []
for _ in range(n_episodes):
obs= env.reset()
done= False
while not done:
action, _= model.predict(obs, deterministic=True)
obs, reward, done, info= env.step(action)
final_net_worths.append(env.net_worth)
return np.mean(final_net_worths)
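# Note: with deterministic=True and a single chronological pass, the final net worth is reproducible for a given policy.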
# We'll define the DQN objective with Optuna
def dqn_objective(trial):
# we sample some DQN hyperparams
lr = trial.suggest_loguniform("lr", 1e-5, 1e-2)  # newer Optuna prefers trial.suggest_float("lr", 1e-5, 1e-2, log=True)
gamma = trial.suggest_float("gamma", 0.8, 0.9999)
exploration_fraction= trial.suggest_float("exploration_fraction", 0.01, 0.3)
buffer_size = trial.suggest_categorical("buffer_size",[5000,10000,20000])
batch_size = trial.suggest_categorical("batch_size",[32,64,128])
# Build a fresh environment instance (same data) for each trial
env = StockTradingEnvWithLSTM(
df=df,
feature_columns= feature_columns,
lstm_model= final_lstm, # use the best LSTM
scaler_features= scaler_features,
scaler_target= scaler_target,
window_size= lstm_window_size
)
vec_env = DummyVecEnv([lambda: env])
# Build DQN
dqn_action_logger = ActionLoggingCallback(verbose=0)
model = DQN(
'MlpPolicy',
vec_env,
verbose=0,
learning_rate= lr,
gamma= gamma,
exploration_fraction= exploration_fraction,
buffer_size= buffer_size,
batch_size= batch_size,
train_freq=4,
target_update_interval=1000,
# etc
)
# Train some timesteps
model.learn(total_timesteps= dqn_total_timesteps, callback=dqn_action_logger)
# Evaluate final net worth
final_net_worth= evaluate_dqn_networth(model, env, n_episodes=dqn_eval_episodes)
# we want to maximize net worth => minimize negative net worth
return -final_net_worth
logging.info("Starting DQN hyperparam tuning with Optuna (using LSTM environment)...")
study_dqn = optuna.create_study(direction='minimize')
study_dqn.optimize(dqn_objective, n_trials=n_trials_dqn)
best_dqn_params = study_dqn.best_params
logging.info(f"Best DQN hyperparams: {best_dqn_params}")
###################################
# D) TRAIN FINAL DQN WITH BEST PARAMS
###################################
logging.info("Training final DQN with best hyperparams & LSTM environment...")
env_final = StockTradingEnvWithLSTM(
df=df,
feature_columns=feature_columns,
lstm_model= final_lstm,
scaler_features= scaler_features,
scaler_target= scaler_target,
window_size= lstm_window_size
)
vec_env_final = DummyVecEnv([lambda: env_final])
# Build final model
final_dqn_logger = ActionLoggingCallback(verbose=1) # We'll see logs each rollout
final_model= DQN(
'MlpPolicy',
vec_env_final,
verbose=1,
learning_rate= best_dqn_params['lr'],
gamma= best_dqn_params['gamma'],
exploration_fraction= best_dqn_params['exploration_fraction'],
buffer_size= best_dqn_params['buffer_size'],
batch_size= best_dqn_params['batch_size'],
train_freq=4,
target_update_interval=1000
# etc if you want other params
)
final_model.learn(total_timesteps= dqn_total_timesteps, callback= final_dqn_logger)
final_model.save("best_dqn_model_lstm.zip")
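# The saved agent can be reloaded later with DQN.load("best_dqn_model_lstm.zip") (standard stable-baselines3 API).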
###################################
# E) FINAL INFERENCE & LOG RESULTS
###################################
logging.info("Running final inference with best DQN...")
env_test = StockTradingEnvWithLSTM(
df=df,
feature_columns= feature_columns,
lstm_model= final_lstm,
scaler_features= scaler_features,
scaler_target= scaler_target,
window_size= lstm_window_size
)
obs = env_test.reset()
done=False
total_reward=0.0
step_data=[]
step_count=0
while not done:
step_count+=1
action, _= final_model.predict(obs, deterministic=True)
obs, reward, done, info= env_test.step(action)
total_reward+= reward
step_data.append({
"Step": step_count,
"Action": int(action[0]),
"Reward": reward_scalar,
"Balance": underlying_env.balance,
"Shares": underlying_env.shares_held,
"NetWorth": underlying_env.net_worth
"Action": int(action),
"Reward": reward,
"Balance": env_test.balance,
"Shares": env_test.shares_held,
"NetWorth": env_test.net_worth
})
final_net_worth= env_test.net_worth
final_profit= final_net_worth - env_test.initial_balance
print("\n=== DQN Agent Finished ===")
print(f"Total Steps Taken: {step_count}")
print("\n=== Final DQN Inference ===")
print(f"Total Steps: {step_count}")
print(f"Final Net Worth: {final_net_worth:.2f}")
print(f"Final Profit: {final_profit:.2f}")
print(f"Sum of Rewards: {total_reward:.2f}")
@@ -550,17 +695,20 @@ def main():
hold_count = sum(1 for x in step_data if x["Action"]==1)
print(f"Actions Taken -> BUY:{buy_count}, SELL:{sell_count}, HOLD:{hold_count}")
# Show last 15 steps
last_n= step_data[-15:] if len(step_data)>15 else step_data
rows=[]
for d in last_n:
rows.append([
d["Step"], d["Action"], f"{d['Reward']:.2f}",
f"{d['Balance']:.2f}", d["Shares"], f"{d['NetWorth']:.2f}"
d["Step"],
d["Action"],
f"{d['Reward']:.2f}",
f"{d['Balance']:.2f}",
d["Shares"],
f"{d['NetWorth']:.2f}"
])
headers= ["Step","Action","Reward","Balance","Shares","NetWorth"]
print(f"\n== Last {steps_to_display} Steps ==")
print(f"\n== Last 15 Steps ==")
print(tabulate(rows, headers=headers, tablefmt="pretty"))
logging.info("All tasks complete. Exiting.")

File diff suppressed because one or more lines are too long