Updating LSTMDQN.py to incorporate Optuna tuning into the DQN model training, and possibly re-running if the DQN underperforms

This commit is contained in:
2025-01-29 18:32:56 +00:00
parent 2ca8890e5f
commit 78ef2965b8
3 changed files with 440 additions and 1046 deletions

View File

@@ -1 +1,2 @@
venv/
.python-version

View File

@@ -3,42 +3,44 @@ import sys
import argparse
import numpy as np
import pandas as pd
import logging
from tabulate import tabulate

import matplotlib.pyplot as plt
import seaborn as sns

# TensorFlow / Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.losses import Huber
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam, Nadam

# Sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

# Optuna
import optuna
from optuna.integration import KerasPruningCallback

# RL stuff
import gym
from gym import spaces
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback

# Suppress TensorFlow logs beyond errors
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

######################################################
# 1. DATA LOADING & ADVANCED TECHNICAL INDICATORS
######################################################
def load_data(file_path):
    logging.info(f"Loading data from: {file_path}")
    try:
@@ -68,7 +70,6 @@ def load_data(file_path):
    logging.info("Data loaded and sorted successfully.")
    return df

def compute_rsi(series, window=14):
    delta = series.diff()
    gain = delta.where(delta>0, 0).rolling(window=window).mean()
@@ -76,7 +77,6 @@ def compute_rsi(series, window=14):
    RS = gain / (loss+1e-9)
    return 100 - (100/(1+RS))

def compute_macd(series, span_short=12, span_long=26, span_signal=9):
    ema_short = series.ewm(span=span_short, adjust=False).mean()
    ema_long = series.ewm(span=span_long, adjust=False).mean()
@@ -84,14 +84,11 @@ def compute_macd(series, span_short=12, span_long=26, span_signal=9):
    signal_line = macd_line.ewm(span=span_signal, adjust=False).mean()
    return macd_line - signal_line  # histogram

def compute_obv(df):
    signed_volume = (np.sign(df['Close'].diff()) * df['Volume']).fillna(0)
    return signed_volume.cumsum()

def compute_adx(df, window=14):
    df['H-L'] = df['High'] - df['Low']
    df['H-Cp'] = (df['High'] - df['Close'].shift(1)).abs()
    df['L-Cp'] = (df['Low'] - df['Close'].shift(1)).abs()
@@ -101,7 +98,6 @@ def compute_adx(df, window=14):
    df.drop(['H-L','H-Cp','L-Cp'], axis=1, inplace=True)
    return adx_placeholder

def compute_bollinger_bands(series, window=20, num_std=2):
    sma = series.rolling(window=window).mean()
    std = series.rolling(window=window).std()
@@ -110,7 +106,6 @@ def compute_bollinger_bands(series, window=20, num_std=2):
    bandwidth = (upper - lower)/(sma + 1e-9)
    return upper, lower, bandwidth

def compute_mfi(df, window=14):
    typical_price = (df['High']+ df['Low']+ df['Close'])/3
    money_flow = typical_price* df['Volume']
@@ -122,22 +117,19 @@ def compute_mfi(df, window=14):
    mfi= 100-(100/(1+ pos_sum/(neg_sum+1e-9)))
    return mfi

def calculate_technical_indicators(df):
    logging.info("Calculating technical indicators...")
    df['RSI'] = compute_rsi(df['Close'], 14)
    df['MACD']= compute_macd(df['Close'])
    df['OBV'] = compute_obv(df)
    df['ADX'] = compute_adx(df)
    up, lo, bw = compute_bollinger_bands(df['Close'], 20, 2)
    df['BB_Upper']= up
    df['BB_Lower']= lo
    df['BB_Width']= bw
    df['MFI'] = compute_mfi(df,14)
    df['SMA_5'] = df['Close'].rolling(5).mean()
    df['SMA_10']= df['Close'].rolling(10).mean()
    df['EMA_5'] = df['Close'].ewm(span=5, adjust=False).mean()
@@ -150,67 +142,123 @@ def calculate_technical_indicators(df):
###############################
# 2. ARG PARSING
###############################
def parse_arguments():
    parser = argparse.ArgumentParser(description='All-in-One: LSTM + DQN (with LSTM predictions) + Tuning.')
    parser.add_argument('csv_path', type=str,
                        help='Path to CSV data with columns [time, open, high, low, close, volume].')
    parser.add_argument('--lstm_window_size', type=int, default=15,
                        help='Sequence window size for LSTM. Default=15.')
    parser.add_argument('--dqn_total_timesteps', type=int, default=50000,
                        help='Total timesteps to train each DQN candidate. Default=50000.')
    parser.add_argument('--dqn_eval_episodes', type=int, default=1,
                        help='Number of episodes to evaluate DQN in the tuning step. Default=1 (entire dataset once).')
    parser.add_argument('--n_trials_lstm', type=int, default=30,
                        help='Number of Optuna trials for LSTM. Default=30.')
    parser.add_argument('--n_trials_dqn', type=int, default=20,
                        help='Number of Optuna trials for DQN. Default=20.')
    return parser.parse_args()

###############################
# 3. CUSTOM DQN CALLBACK: LOG ACTIONS + REWARDS
###############################
class ActionLoggingCallback(BaseCallback):
    """
    Logs distribution of actions and average reward after each rollout.
    For off-policy (DQN), "rollout" can be a bit different than on-policy,
    but stable-baselines3 still calls `_on_rollout_end` periodically.
    """
    def __init__(self, verbose=0):
        super().__init__(verbose)
        self.action_buffer = []
        self.reward_buffer = []

    def _on_training_start(self):
        self.action_buffer = []
        self.reward_buffer = []

    def _on_step(self):
        action = self.locals.get('action', None)
        reward = self.locals.get('reward', None)
        if action is not None:
            self.action_buffer.append(action)
        if reward is not None:
            self.reward_buffer.append(reward)
        return True

    def _on_rollout_end(self):
        import numpy as np
        actions = np.array(self.action_buffer)
        rewards = np.array(self.reward_buffer)
        if len(actions)>0:
            unique, counts = np.unique(actions, return_counts=True)
            total = len(actions)
            distr_str = []
            for act, c in zip(unique, counts):
                distr_str.append(f"Action {act}: {c} times ({100*c/total:.2f}%)")
            logging.info(" -- DQN Rollout End -- ")
            logging.info("  " + ", ".join(distr_str))
            logging.info(f"  Avg Reward this rollout: {rewards.mean():.4f} (min={rewards.min():.4f}, max={rewards.max():.4f})")
        self.action_buffer = []
        self.reward_buffer = []

###############################
# 4. MAIN
###############################
def main():
    args = parse_arguments()
    csv_path = args.csv_path
    lstm_window_size = args.lstm_window_size
    dqn_total_timesteps = args.dqn_total_timesteps
    dqn_eval_episodes = args.dqn_eval_episodes
    n_trials_lstm = args.n_trials_lstm
    n_trials_dqn = args.n_trials_dqn

    ##########################################
    # A) LSTM PART: LOAD, PREPROCESS, TUNE
    ##########################################
    # 1) LOAD & preprocess
    df = load_data(csv_path)
    df = calculate_technical_indicators(df)

    # We'll exclude 'Close' from the feature set
    feature_columns = [
        'SMA_5','SMA_10','EMA_5','EMA_10','STDDEV_5',
        'RSI','MACD','ADX','OBV','Volume','Open','High','Low',
        'BB_Upper','BB_Lower','BB_Width','MFI'
    ]
    target_column = 'Close'
    df = df[['Date']+ feature_columns+[target_column]].dropna()

    from sklearn.preprocessing import MinMaxScaler
    scaler_features = MinMaxScaler()
    scaler_target = MinMaxScaler()

    X_all = df[feature_columns].values
    y_all = df[[target_column]].values
    X_scaled = scaler_features.fit_transform(X_all)
    y_scaled = scaler_target.fit_transform(y_all).flatten()

    # 2) Create sequences
    def create_sequences(features, target, window_size):
        X_seq, y_seq = [], []
        for i in range(len(features) - window_size):
            X_seq.append(features[i:i+window_size])
            y_seq.append(target[i+window_size])
        return np.array(X_seq), np.array(y_seq)

    X, y = create_sequences(X_scaled, y_scaled, lstm_window_size)

    # 3) Split into train/val/test
    train_size = int(len(X)*0.7)
    val_size = int(len(X)*0.15)
    test_size = len(X)- train_size- val_size
    X_train, y_train = X[:train_size], y[:train_size]
    X_val, y_val = X[train_size: train_size+ val_size], y[train_size: train_size+ val_size]
    X_test, y_test = X[train_size+ val_size:], y[train_size+ val_size:]

    logging.info(f"Scaled training features shape: {X_train.shape}")
    logging.info(f"Scaled validation features shape: {X_val.shape}")
@@ -219,14 +267,14 @@ def main():
    logging.info(f"Scaled validation target shape: {y_val.shape}")
    logging.info(f"Scaled testing target shape: {y_test.shape}")

    # 4) GPU config
    def configure_device():
        gpus = tf.config.list_physical_devices('GPU')
        if gpus:
            try:
                for gpu in gpus:
                    tf.config.experimental.set_memory_growth(gpu, True)
                logging.info(f"{len(gpus)} GPU(s) detected & configured.")
            except RuntimeError as e:
                logging.error(e)
        else:
@@ -234,33 +282,38 @@ def main():
    configure_device()

    # 5) Build LSTM function
    def build_lstm(input_shape, hyperparams):
        from tensorflow.keras.regularizers import l2
        model = Sequential()
        num_layers = hyperparams['num_lstm_layers']
        units = hyperparams['lstm_units']
        drop = hyperparams['dropout_rate']
        for i in range(num_layers):
            return_seqs = (i< num_layers-1)
            model.add(Bidirectional(
                LSTM(units, return_sequences=return_seqs, kernel_regularizer=l2(1e-4)),
                input_shape=input_shape if i==0 else None
            ))
            model.add(Dropout(drop))
        model.add(Dense(1, activation='linear'))

        opt_name= hyperparams['optimizer']
        lr = hyperparams['learning_rate']
        decay = hyperparams['decay']
        if opt_name=='Adam':
            opt= Adam(learning_rate=lr, decay=decay)
        elif opt_name=='Nadam':
            opt= Nadam(learning_rate=lr)
        else:
            opt= Adam(learning_rate=lr)

        model.compile(loss=Huber(), optimizer=opt, metrics=['mae'])
        return model

    # 6) Optuna objective for LSTM
    def lstm_objective(trial):
        import tensorflow as tf
        num_lstm_layers = trial.suggest_int('num_lstm_layers',1,3)
        lstm_units = trial.suggest_categorical('lstm_units',[32,64,96,128])
        dropout_rate = trial.suggest_float('dropout_rate',0.1,0.5)
@@ -277,7 +330,7 @@ def main():
            'decay': decay
        }
        model_ = build_lstm((X_train.shape[1], X_train.shape[2]), hyperparams)
        early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
        lr_reduce = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)
        cb_prune = KerasPruningCallback(trial, 'val_loss')
@@ -293,21 +346,19 @@ def main():
        val_mae = min(history.history['val_mae'])
        return val_mae

    logging.info("Starting LSTM hyperparam optimization with Optuna...")
    study_lstm= optuna.create_study(direction='minimize')
    study_lstm.optimize(lstm_objective, n_trials=n_trials_lstm)
    best_lstm_params = study_lstm.best_params
    logging.info(f"Best LSTM Hyperparams: {best_lstm_params}")

    # 7) Train final LSTM
    final_lstm = build_lstm((X_train.shape[1], X_train.shape[2]), best_lstm_params)
    early_stop_final= EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
    lr_reduce_final= ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

    logging.info("Training best LSTM model with found hyperparams...")
    hist= final_lstm.fit(
        X_train,y_train,
        epochs=300,
        batch_size=16,
@@ -316,93 +367,98 @@ def main():
        verbose=1
    )

    # Evaluate LSTM
    def evaluate_lstm(model, X_test, y_test):
        logging.info("Evaluating final LSTM...")
        y_pred_scaled= model.predict(X_test).flatten()
        y_pred_scaled= np.clip(y_pred_scaled,0,1)
        y_pred = scaler_target.inverse_transform(y_pred_scaled.reshape(-1,1)).flatten()
        y_test_actual= scaler_target.inverse_transform(y_test.reshape(-1,1)).flatten()

        mse_= mean_squared_error(y_test_actual,y_pred)
        rmse_= np.sqrt(mse_)
        mae_ = mean_absolute_error(y_test_actual,y_pred)
        r2_ = r2_score(y_test_actual,y_pred)

        direction_actual= np.sign(np.diff(y_test_actual))
        direction_pred = np.sign(np.diff(y_pred))
        directional_accuracy= np.mean(direction_actual== direction_pred)

        logging.info(f"Test MSE: {mse_}")
        logging.info(f"Test RMSE: {rmse_}")
        logging.info(f"Test MAE: {mae_}")
        logging.info(f"Test R2 Score: {r2_}")
        logging.info(f"Directional Accuracy: {directional_accuracy}")

        # Plot
        plt.figure(figsize=(14,7))
        plt.plot(y_test_actual, label='Actual Price')
        plt.plot(y_pred, label='Predicted Price')
        plt.title('LSTM: Actual vs Predicted')
        plt.legend()
        plt.grid(True)
        plt.savefig('lstm_actual_vs_pred.png')
        plt.close()

        # Tabulate first 40
        table=[]
        limit= min(40,len(y_test_actual))
        for i in range(limit):
            table.append([i, round(y_test_actual[i],2), round(y_pred[i],2)])
        headers= ["Index","Actual Price","Predicted Price"]
        print(tabulate(table, headers=headers, tablefmt="pretty"))
        return r2_, directional_accuracy

    _r2, _diracc= evaluate_lstm(final_lstm, X_test, y_test)

    # Save LSTM + scalers
    final_lstm.save('best_lstm_model.h5')
    joblib.dump(scaler_features,'scaler_features.pkl')
    joblib.dump(scaler_target, 'scaler_target.pkl')
    logging.info("Saved best LSTM model + scalers (best_lstm_model.h5, scaler_features.pkl, scaler_target.pkl).")

    ############################################################
    # B) DQN PART: BUILD ENV THAT USES THE LSTM + FORECAST
    ############################################################
    class StockTradingEnvWithLSTM(gym.Env):
        """
        An environment that uses the LSTM model's predicted next day close
        as part of the observation:
          obs = [technical indicators, balance, shares, cost_basis, predicted_next_close].
        Reward => net_worth - initial_balance each step.
        """
        metadata = {'render.modes':['human']}

        def __init__(self, df, feature_columns, lstm_model, scaler_features, scaler_target,
                     window_size=15, initial_balance=10000, transaction_cost=0.001):
            super().__init__()
            self.df= df.reset_index(drop=True)
            self.feature_columns= feature_columns
            self.lstm_model= lstm_model
            self.scaler_features= scaler_features
            self.scaler_target= scaler_target
            self.window_size= window_size

            self.initial_balance= initial_balance
            self.balance= initial_balance
            self.net_worth= initial_balance
            self.transaction_cost= transaction_cost

            self.max_steps= len(df)
            self.current_step=0
            self.shares_held=0
            self.cost_basis=0

            # raw array of features
            self.raw_features= df[feature_columns].values

            # 0=Sell,1=Hold,2=Buy
            self.action_space= spaces.Discrete(3)
            # observation dimension = len(feature_columns)+3 +1 => 17 + 3 + 1 = 21
            self.observation_space= spaces.Box(
                low=0, high=1,
                shape=(len(feature_columns)+3+1,),
                dtype=np.float32
            )
@@ -412,26 +468,42 @@ def main():
            self.current_step=0
            self.shares_held=0
            self.cost_basis=0
            return self._get_obs()

        def _get_obs(self):
            row= self.raw_features[self.current_step]
            row_max= np.max(row) if np.max(row)!=0 else 1.0
            row_norm= row/row_max

            additional= np.array([
                self.balance/self.initial_balance,
                self.shares_held/100.0,
                self.cost_basis/(self.initial_balance+1e-9)
            ], dtype=np.float32)

            # LSTM prediction
            if self.current_step< self.window_size:
                # not enough history => no forecast
                predicted_close= 0.0
            else:
                seq= self.raw_features[self.current_step - self.window_size: self.current_step]
                seq_scaled= self.scaler_features.transform(seq)
                seq_scaled= np.expand_dims(seq_scaled, axis=0)  # shape (1, window_size, #features)
                pred_scaled= self.lstm_model.predict(seq_scaled, verbose=0).flatten()[0]
                pred_scaled= np.clip(pred_scaled,0,1)
                unscaled= self.scaler_target.inverse_transform([[pred_scaled]])[0,0]
                # either keep raw or scale it. We'll do a naive scale by /1000 if typical price is double digits
                predicted_close= unscaled/1000.0

            obs= np.concatenate([row_norm, additional, [predicted_close]]).astype(np.float32)
            return obs

        def step(self, action):
            prev_net_worth= self.net_worth
            current_price= self.df.loc[self.current_step,'Close']

            if action==2:  # BUY
                shares_bought= int(self.balance// current_price)
                if shares_bought>0:
                    cost= shares_bought* current_price
@@ -439,12 +511,11 @@ def main():
                    self.balance-= (cost+ fee)
                    old_shares= self.shares_held
                    self.shares_held+= shares_bought
                    self.cost_basis=(
                        (self.cost_basis* old_shares)+ (shares_bought* current_price)
                    )/ self.shares_held
            elif action==0:  # SELL
                if self.shares_held>0:
                    revenue= self.shares_held* current_price
                    fee= revenue* self.transaction_cost
@@ -452,95 +523,169 @@ def main():
                    self.shares_held=0
                    self.cost_basis=0

            self.net_worth= self.balance+ self.shares_held* current_price
            self.current_step+=1
            done= (self.current_step>= self.max_steps -1)

            reward= self.net_worth- self.initial_balance
            obs= self._get_obs()
            return obs, reward, done, {}

        def render(self, mode='human'):
            profit= self.net_worth- self.initial_balance
            print(f"Step: {self.current_step}, "
                  f"Balance={self.balance:.2f}, "
                  f"Shares={self.shares_held}, "
                  f"NetWorth={self.net_worth:.2f}, "
                  f"Profit={profit:.2f}")

    ###################################
    # C) DQN HYPERPARAM TUNING W/ LSTM
    ###################################
    # We'll define a function that trains a DQN with trial hyperparams,
    # then evaluates final net worth on one run.
    from stable_baselines3.common.evaluation import evaluate_policy

    # We'll define a small function to do final net worth check:
    def evaluate_dqn_networth(model, env, n_episodes=1):
        # We do a simple loop that runs the entire dataset (1 episode)
        # to see final net worth.
        # If you want multiple episodes, you can do multiple resets in random start positions, etc.
        final_net_worths = []
        for _ in range(n_episodes):
            obs= env.reset()
            done= False
            while not done:
                action, _= model.predict(obs, deterministic=True)
                obs, reward, done, info= env.step(action)
            final_net_worths.append(env.net_worth)
        return np.mean(final_net_worths)

    # We'll define the DQN objective with Optuna
    def dqn_objective(trial):
        # we sample some DQN hyperparams
        lr = trial.suggest_loguniform("lr", 1e-5, 1e-2)
        gamma = trial.suggest_float("gamma", 0.8, 0.9999)
        exploration_fraction= trial.suggest_float("exploration_fraction", 0.01, 0.3)
        buffer_size = trial.suggest_categorical("buffer_size",[5000,10000,20000])
        batch_size = trial.suggest_categorical("batch_size",[32,64,128])

        # Build environment fresh each time or reuse:
        # We'll reuse the same data environment but new instance
        env = StockTradingEnvWithLSTM(
            df=df,
            feature_columns= feature_columns,
            lstm_model= final_lstm,  # use the best LSTM
            scaler_features= scaler_features,
            scaler_target= scaler_target,
            window_size= lstm_window_size
        )
        vec_env = DummyVecEnv([lambda: env])

        # Build DQN
        from stable_baselines3 import DQN
        from stable_baselines3.common.callbacks import BaseCallback
        dqn_action_logger = ActionLoggingCallback(verbose=0)
        model = DQN(
            'MlpPolicy',
            vec_env,
            verbose=0,
            learning_rate= lr,
            gamma= gamma,
            exploration_fraction= exploration_fraction,
            buffer_size= buffer_size,
            batch_size= batch_size,
            train_freq=4,
            target_update_interval=1000,
            # etc
        )
        # Train some timesteps
        model.learn(total_timesteps= dqn_total_timesteps, callback=dqn_action_logger)

        # Evaluate final net worth
        final_net_worth= evaluate_dqn_networth(model, env, n_episodes=dqn_eval_episodes)

        # we want to maximize net worth => minimize negative net worth
        return -final_net_worth

    logging.info("Starting DQN hyperparam tuning with Optuna (using LSTM environment)...")
    study_dqn = optuna.create_study(direction='minimize')
    study_dqn.optimize(dqn_objective, n_trials=n_trials_dqn)
    best_dqn_params = study_dqn.best_params
    logging.info(f"Best DQN hyperparams: {best_dqn_params}")

    ###################################
    # D) TRAIN FINAL DQN WITH BEST PARAMS
    ###################################
    logging.info("Training final DQN with best hyperparams & LSTM environment...")
    env_final = StockTradingEnvWithLSTM(
        df=df,
        feature_columns=feature_columns,
        lstm_model= final_lstm,
        scaler_features= scaler_features,
        scaler_target= scaler_target,
        window_size= lstm_window_size
    )
    vec_env_final = DummyVecEnv([lambda: env_final])

    # Build final model
    final_dqn_logger = ActionLoggingCallback(verbose=1)  # We'll see logs each rollout
    final_model= DQN(
        'MlpPolicy',
        vec_env_final,
        verbose=1,
        learning_rate= best_dqn_params['lr'],
        gamma= best_dqn_params['gamma'],
        exploration_fraction= best_dqn_params['exploration_fraction'],
        buffer_size= best_dqn_params['buffer_size'],
        batch_size= best_dqn_params['batch_size'],
        train_freq=4,
        target_update_interval=1000
        # etc if you want other params
    )
    final_model.learn(total_timesteps= dqn_total_timesteps, callback= final_dqn_logger)
    final_model.save("best_dqn_model_lstm.zip")

    ###################################
    # E) FINAL INFERENCE & LOG RESULTS
    ###################################
    logging.info("Running final inference with best DQN...")
    env_test = StockTradingEnvWithLSTM(
        df=df,
        feature_columns= feature_columns,
        lstm_model= final_lstm,
        scaler_features= scaler_features,
        scaler_target= scaler_target,
        window_size= lstm_window_size
    )
    obs = env_test.reset()
    done=False
    total_reward=0.0
    step_data=[]
    step_count=0

    while not done:
        step_count+=1
        action, _= final_model.predict(obs, deterministic=True)
        obs, reward, done, info= env_test.step(action)
        total_reward+= reward
        step_data.append({
            "Step": step_count,
            "Action": int(action),
            "Reward": reward,
            "Balance": env_test.balance,
            "Shares": env_test.shares_held,
            "NetWorth": env_test.net_worth
        })

    final_net_worth= env_test.net_worth
    final_profit= final_net_worth - env_test.initial_balance

    print("\n=== Final DQN Inference ===")
    print(f"Total Steps: {step_count}")
    print(f"Final Net Worth: {final_net_worth:.2f}")
    print(f"Final Profit: {final_profit:.2f}")
    print(f"Sum of Rewards: {total_reward:.2f}")
@@ -550,17 +695,20 @@ def main():
    hold_count = sum(1 for x in step_data if x["Action"]==1)
    print(f"Actions Taken -> BUY:{buy_count}, SELL:{sell_count}, HOLD:{hold_count}")

    # Show last 15 steps
    last_n= step_data[-15:] if len(step_data)>15 else step_data
    rows=[]
    for d in last_n:
        rows.append([
            d["Step"],
            d["Action"],
            f"{d['Reward']:.2f}",
            f"{d['Balance']:.2f}",
            d["Shares"],
            f"{d['NetWorth']:.2f}"
        ])
    headers= ["Step","Action","Reward","Balance","Shares","NetWorth"]
    print(f"\n== Last 15 Steps ==")
    print(tabulate(rows, headers=headers, tablefmt="pretty"))

    logging.info("All tasks complete. Exiting.")

File diff suppressed because one or more lines are too long
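
For reference, a minimal usage sketch for the artifacts this commit saves (best_lstm_model.h5, scaler_features.pkl, scaler_target.pkl, best_dqn_model_lstm.zip). It is not part of the commit itself: the CSV path in the invocation comment is a placeholder, the predict_next_close helper is illustrative, and the 15-row window is simply the script's --lstm_window_size default.

# Example invocation of the updated script (CSV path is a placeholder):
#   python LSTMDQN.py your_data.csv --n_trials_lstm 30 --n_trials_dqn 20 --dqn_total_timesteps 50000
#
# Reloading the saved artifacts later for inference:
import joblib
import numpy as np
from tensorflow.keras.models import load_model
from stable_baselines3 import DQN

lstm = load_model('best_lstm_model.h5')               # LSTM forecaster saved by the script
scaler_features = joblib.load('scaler_features.pkl')  # MinMaxScaler fitted on the feature columns
scaler_target = joblib.load('scaler_target.pkl')      # MinMaxScaler fitted on 'Close'
dqn = DQN.load('best_dqn_model_lstm.zip')             # tuned trading agent

def predict_next_close(recent_features, window_size=15):
    # Illustrative helper (not defined in the commit): one-step close forecast
    # from the last `window_size` raw feature rows, mirroring the env's _get_obs logic.
    seq = scaler_features.transform(recent_features[-window_size:])
    pred_scaled = lstm.predict(seq[np.newaxis, ...], verbose=0).flatten()[0]
    return scaler_target.inverse_transform([[np.clip(pred_scaled, 0, 1)]])[0, 0]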