Starter Code
MidasHMM/hmm/midas/__pycache__/data_processor.cpython-39.pyc (new binary file, not shown)
MidasHMM/hmm/midas/__pycache__/feature_engineer.cpython-39.pyc (new binary file, not shown)
MidasHMM/hmm/midas/__pycache__/hmm_trainer.cpython-39.pyc (new binary file, not shown)
MidasHMM/hmm/midas/analysis.py (new file, +49)
# midas/analysis.py
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd


class MarketRegimeAnalysis:
    def __init__(self, model, features):
        self.model = model
        self.features = features
        self.states = model.predict(features)

    def plot_regimes(self, prices: pd.Series):
        """Scatter plot of prices, colored by inferred regime."""
        plt.figure(figsize=(15, 8))
        palette = sns.color_palette("husl", n_colors=self.model.n_components)

        for state in range(self.model.n_components):
            mask = self.states == state
            plt.scatter(prices.index[mask], prices[mask],
                        color=palette[state], s=10, label=f'Regime {state}')

        plt.title("Market Regime Visualization")
        plt.xlabel("Date")
        plt.ylabel("Price")
        plt.legend()
        return plt

    def plot_transition_matrix(self):
        """Heatmap of the fitted state transition probabilities."""
        transmat = self.model.transmat_
        plt.figure(figsize=(10, 8))
        sns.heatmap(transmat, annot=True, fmt=".2f", cmap="Blues",
                    xticklabels=range(transmat.shape[1]),   # columns: next state
                    yticklabels=range(transmat.shape[0]))   # rows: current state
        plt.title("State Transition Probabilities")
        plt.xlabel("Next State")
        plt.ylabel("Current State")
        return plt

    def plot_state_durations(self):
        """Histogram of how many consecutive bars each regime visit lasts."""
        state_changes = np.diff(self.states, prepend=self.states[0])
        change_points = np.where(state_changes != 0)[0]
        # include the start and end so the first and last runs are counted
        boundaries = np.concatenate(([0], change_points, [len(self.states)]))
        durations = np.diff(boundaries)

        plt.figure(figsize=(10, 6))
        sns.histplot(durations, bins=30, kde=True)
        plt.title("Regime Duration Distribution")
        plt.xlabel("Duration (Bars)")
        plt.ylabel("Frequency")
        return plt
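For orientation, a minimal usage sketch of this class. The `model`, `features`, and `prices` objects are assumed to come from the trainer and processor modules below, and the output file names are illustrative, not part of this commit:

# Hypothetical usage sketch: `model` is a fitted hmmlearn GaussianHMM,
# `features` the scaled feature DataFrame, `prices` a close-price Series.
analysis = MarketRegimeAnalysis(model, features)
analysis.plot_regimes(prices).savefig("regimes.png")          # each method returns plt
analysis.plot_transition_matrix().savefig("transitions.png")
analysis.plot_state_durations().savefig("durations.png")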
MidasHMM/hmm/midas/data_processor.py (new file, +95)
# midas/data_processor.py
import pandas as pd
import numpy as np
from pathlib import Path
import logging
from typing import Dict


class DataProcessor:
    def __init__(self, config: Dict):
        self.config = config
        self.logger = logging.getLogger(__name__)

    def _load_raw_data(self, ticker: str) -> pd.DataFrame:
        file_path = Path(self.config['data_dir']) / f"{ticker}_5min_3years.csv"
        if not file_path.exists():
            raise FileNotFoundError(f"Data file not found: {file_path}")

        df = pd.read_csv(
            file_path,
            parse_dates=['timestamp'],
            usecols=['timestamp', 'open', 'high', 'low', 'close', 'volume'],
            dtype={
                'open': 'float32',
                'high': 'float32',
                'low': 'float32',
                'close': 'float32',
                'volume': 'float32'
            }
        )
        return df.sort_values('timestamp').set_index('timestamp')

    def _clean_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Robust data cleaning pipeline."""
        # Handle missing values
        df = df.dropna()

        # Validate price data
        self._validate_prices(df)

        # Restrict to market hours (intraday data only)
        df = self._filter_trading_hours(df)

        # Remove outliers
        df = self._remove_price_outliers(df)
        df = self._remove_volume_outliers(df)

        # Resample onto a regular grid and forward-fill missing intervals
        df = df.resample(self.config['resample_freq']).last().ffill()

        return df

    def _validate_prices(self, df: pd.DataFrame):
        """Ensure price data integrity."""
        if (df['close'] <= 0).any():
            bad_values = df[df['close'] <= 0]
            self.logger.error(f"Invalid close prices: {bad_values.index}")
            raise ValueError("Negative/zero close prices detected")

        if not (df['high'] >= df['low']).all():
            raise ValueError("High prices < Low prices detected")

        if not df.index.is_monotonic_increasing:
            self.logger.warning("Non-monotonic timestamps detected")

    def _filter_trading_hours(self, df: pd.DataFrame) -> pd.DataFrame:
        """Remove non-market hours (if intraday data)."""
        freq = pd.infer_freq(df.index)
        # infer_freq returns strings like '5T'/'5min' or 'H'/'h' for intraday
        # bars; strip any leading multiplier before checking the unit
        if freq and freq.lstrip('0123456789') in ('T', 'min', 'H', 'h'):
            return df.between_time('09:30', '16:00')
        return df

    def _remove_price_outliers(self, df: pd.DataFrame) -> pd.DataFrame:
        """Remove bars whose log return is beyond 10 standard deviations."""
        returns = np.log(df['close']).diff()
        mask = returns.abs() < 10 * returns.std()
        mask.iloc[0] = True  # the first bar has no return; keep it
        return df[mask]

    def _remove_volume_outliers(self, df: pd.DataFrame) -> pd.DataFrame:
        """Remove volume spikes beyond 20 standard deviations (in log space)."""
        log_volume = np.log1p(df['volume'])  # log1p tolerates zero-volume bars
        mask = log_volume < log_volume.mean() + 20 * log_volume.std()
        return df[mask]

    def process_tickers(self) -> Dict[str, pd.DataFrame]:
        """Process all tickers with cleaning and resampling."""
        processed = {}
        for ticker in self.config['tickers']:
            try:
                self.logger.info(f"Processing {ticker}")
                df = self._load_raw_data(ticker)
                df = self._clean_data(df)
                processed[ticker] = df
            except Exception as e:
                self.logger.error(f"Failed processing {ticker}: {e}")
                raise
        return processed
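The keys `DataProcessor` reads ('data_dir', 'tickers', 'resample_freq') suggest a config along the following lines. This is a sketch; the keys come from the code above, but every concrete value is an assumption:

# Illustrative config: keys from the code above, values assumed.
config = {
    'data_dir': 'data/raw',      # directory holding {ticker}_5min_3years.csv files
    'tickers': ['SPY', 'QQQ'],   # symbols to load and clean
    'resample_freq': '5min',     # pandas offset alias used when regularizing the index
}

processor = DataProcessor(config)
frames = processor.process_tickers()  # {ticker: cleaned DataFrame}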
MidasHMM/hmm/midas/feature_engineer.py (new file, +72)
# midas/feature_engineer.py
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler
from typing import Dict


class FeatureEngineer:
    def __init__(self, config: Dict):
        self.config = config
        self.scaler = RobustScaler()  # median/IQR scaling handles outliers

    def _calculate_obv(self, df: pd.DataFrame) -> pd.Series:
        """On-Balance Volume, normalized as a 14-period rate of change."""
        obv = (np.sign(df['close'].diff()) * df['volume']).fillna(0).cumsum()
        return obv.pct_change(periods=14)

    def calculate_features(self, data: Dict[str, pd.DataFrame]) -> pd.DataFrame:
        all_features = []

        for ticker, df in data.items():
            features = pd.DataFrame(index=df.index)
            log_returns = np.log(df['close']).diff()

            # Price-based features
            if 'returns' in self.config['features']:
                features['returns'] = log_returns

            if 'volatility' in self.config['features']:
                # computed from log_returns directly, so enabling 'volatility'
                # does not require 'returns' to also be enabled
                features['volatility'] = log_returns.rolling(20).std() * np.sqrt(252)

            if 'rsi' in self.config['features']:
                delta = df['close'].diff()
                gain = delta.where(delta > 0, 0).rolling(14).mean()
                loss = -delta.where(delta < 0, 0).rolling(14).mean()
                features['rsi'] = 100 - (100 / (1 + (gain / loss)))

            if 'macd' in self.config['features']:
                ema12 = df['close'].ewm(span=12, adjust=False).mean()
                ema26 = df['close'].ewm(span=26, adjust=False).mean()
                features['macd'] = ema12 - ema26

            if 'atr' in self.config['features']:
                high_low = df['high'] - df['low']
                high_close = (df['high'] - df['close'].shift()).abs()
                low_close = (df['low'] - df['close'].shift()).abs()
                tr = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
                features['atr'] = tr.rolling(14).mean() / df['close']  # price-normalized ATR

            if 'volume_change' in self.config['features']:
                features['volume_change'] = np.log(df['volume'] / df['volume'].shift(1))

            if 'obv' in self.config['features']:
                features['obv'] = self._calculate_obv(df)

            # Add ticker identifier if combining
            if self.config['combine_tickers']:
                features['ticker'] = ticker

            all_features.append(features.dropna())

        combined = pd.concat(all_features).sort_index()

        # Encode tickers if combining
        if self.config['combine_tickers']:
            combined = pd.get_dummies(combined, columns=['ticker'], prefix='', prefix_sep='')

        # Scale features
        scaled = pd.DataFrame(
            self.scaler.fit_transform(combined),
            index=combined.index,
            columns=combined.columns
        )
        return scaled
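Extending the same sketch, the membership tests above imply two more config keys. The chosen feature list is an assumption; any subset of the names checked in `calculate_features` would work:

# Feature config sketch; keys mirror the checks in calculate_features.
config.update({
    'features': ['returns', 'volatility', 'rsi', 'macd', 'atr', 'volume_change', 'obv'],
    'combine_tickers': True,  # adds one-hot ticker columns when pooling symbols
})

engineer = FeatureEngineer(config)
X = engineer.calculate_features(frames)  # scaled feature matrix for the HMM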
MidasHMM/hmm/midas/hmm_trainer.py (new file, +74)
# midas/hmm_trainer.py
import numpy as np
import pandas as pd
from hmmlearn import hmm
import joblib
from itertools import groupby
from sklearn.model_selection import TimeSeriesSplit
from typing import Dict
import logging


class HMMTrainer:
    def __init__(self, config: Dict):
        self.config = config
        self.logger = logging.getLogger(__name__)
        self.best_model = None

    def _calculate_bic(self, model, X):
        """Bayesian Information Criterion."""
        log_likelihood = model.score(X)
        n_params = model.n_components * (model.n_components - 1) + \
                   model.n_components * X.shape[1] * 2  # means and variances
        return -2 * log_likelihood + n_params * np.log(X.shape[0])

    def train(self, features: pd.DataFrame):
        best_score = -np.inf

        # Time-series cross-validation: each test fold follows its train fold
        tscv = TimeSeriesSplit(n_splits=3)

        for n_components in range(*self.config['n_states_range']):
            try:
                fold_scores = []
                for train_idx, test_idx in tscv.split(features):
                    X_train = features.iloc[train_idx]
                    X_test = features.iloc[test_idx]

                    model = hmm.GaussianHMM(
                        n_components=n_components,
                        covariance_type="diag",
                        n_iter=1000,
                        random_state=42
                    )
                    model.fit(X_train)

                    # Score using both held-out likelihood and regime persistence
                    log_likelihood = model.score(X_test)
                    states = model.predict(X_test)
                    persistence = np.mean([len(list(g)) for _, g in groupby(states)])
                    score = log_likelihood + persistence

                    fold_scores.append(score)

                avg_score = np.mean(fold_scores)
                bic = self._calculate_bic(model, features)  # last fold's model, logged for reference
                self.logger.info(f"States {n_components}: BIC={bic:.2f}, Score={avg_score:.2f}")

                if avg_score > best_score:
                    best_score = avg_score
                    self.best_model = model

            except Exception as e:
                self.logger.error(f"Failed training {n_components} states: {e}")

        if not self.best_model:
            raise RuntimeError("No valid models trained")

        self.best_model.fit(features)  # final training on the full dataset
        return self.best_model

    def save_model(self, path: str, scaler=None):
        """Persist the model, an optional fitted scaler, and the config."""
        joblib.dump({
            'model': self.best_model,
            'scaler': scaler,  # e.g. the FeatureEngineer's fitted RobustScaler
            'config': self.config
        }, path)
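Closing the loop: `train` unpacks `n_states_range` with `range(*...)`, so a 2-tuple bounds the model search. A final hedged sketch tying the modules together; the range value and output file name are assumptions:

# End-to-end sketch: model selection over 2..5 hidden states, then persistence.
config['n_states_range'] = (2, 6)

trainer = HMMTrainer(config)
model = trainer.train(X)
trainer.save_model("midas_hmm.joblib", scaler=engineer.scaler)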