Starter Code

2025-01-29 23:39:42 -05:00
parent d32023c882
commit b4fde9997e
1113 changed files with 210062 additions and 0 deletions

@@ -0,0 +1,49 @@
# midas/analysis.py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


class MarketRegimeAnalysis:
    def __init__(self, model, features):
        self.model = model
        self.features = features
        self.states = model.predict(features)

    def plot_regimes(self, prices: pd.Series):
        """Scatter prices colored by the regime the HMM assigns to each bar."""
        plt.figure(figsize=(15, 8))
        palette = sns.color_palette("husl", n_colors=self.model.n_components)
        for state in range(self.model.n_components):
            mask = self.states == state
            plt.scatter(prices.index[mask], prices[mask],
                        color=palette[state], s=10, label=f'Regime {state}')
        plt.title("Market Regime Visualization")
        plt.xlabel("Date")
        plt.ylabel("Price")
        plt.legend()
        return plt

    def plot_transition_matrix(self):
        """Heatmap of the model's state transition probabilities."""
        transmat = self.model.transmat_
        plt.figure(figsize=(10, 8))
        # Columns are "next state", rows are "current state"
        sns.heatmap(transmat, annot=True, fmt=".2f", cmap="Blues",
                    xticklabels=range(transmat.shape[1]),
                    yticklabels=range(transmat.shape[0]))
        plt.title("State Transition Probabilities")
        plt.xlabel("Next State")
        plt.ylabel("Current State")
        return plt

    def plot_state_durations(self):
        """Histogram of how many consecutive bars each regime persists."""
        state_changes = np.diff(self.states, prepend=self.states[0])
        change_points = np.where(state_changes != 0)[0]
        # Include 0 and len(states) as boundaries so the first and last
        # regime segments are counted too
        boundaries = np.concatenate(([0], change_points, [len(self.states)]))
        durations = np.diff(boundaries)
        plt.figure(figsize=(10, 6))
        sns.histplot(durations, bins=30, kde=True)
        plt.title("Regime Duration Distribution")
        plt.xlabel("Duration (Bars)")
        plt.ylabel("Frequency")
        return plt
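
The plotting helpers above expect an already-fitted model and the feature matrix it was trained on. A minimal usage sketch, assuming a fitted `hmmlearn` GaussianHMM from the trainer below and a close-price series aligned to the features; every variable name here is illustrative, not part of the commit. Each method returns the `plt` module, so `.show()` renders the figure:

# Illustrative usage, not part of the commit: `model`, `features`, and
# `prices` are assumed to come from the pipeline in the files below.
analysis = MarketRegimeAnalysis(model, features)
analysis.plot_regimes(prices.loc[features.index]).show()  # align prices to features
analysis.plot_transition_matrix().show()
analysis.plot_state_durations().show()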

@@ -0,0 +1,95 @@
# midas/data_processor.py
import logging
from pathlib import Path
from typing import Dict

import numpy as np
import pandas as pd


class DataProcessor:
    def __init__(self, config: Dict):
        self.config = config
        self.logger = logging.getLogger(__name__)

    def _load_raw_data(self, ticker: str) -> pd.DataFrame:
        file_path = Path(self.config['data_dir']) / f"{ticker}_5min_3years.csv"
        if not file_path.exists():
            raise FileNotFoundError(f"Data file not found: {file_path}")
        df = pd.read_csv(
            file_path,
            parse_dates=['timestamp'],
            usecols=['timestamp', 'open', 'high', 'low', 'close', 'volume'],
            dtype={
                'open': 'float32',
                'high': 'float32',
                'low': 'float32',
                'close': 'float32',
                'volume': 'float32'
            }
        )
        return df.sort_values('timestamp').set_index('timestamp')

    def _clean_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Robust data cleaning pipeline"""
        # Handle missing values
        df = df.dropna()
        # Validate price data
        self._validate_prices(df)
        # Filter to regular trading hours
        df = self._filter_trading_hours(df)
        # Remove outliers
        df = self._remove_price_outliers(df)
        df = self._remove_volume_outliers(df)
        # Resample and forward fill missing intervals
        df = df.resample(self.config['resample_freq']).last().ffill()
        return df

    def _validate_prices(self, df: pd.DataFrame):
        """Ensure price data integrity"""
        if (df['close'] <= 0).any():
            bad_values = df[df['close'] <= 0]
            self.logger.error(f"Invalid close prices: {bad_values.index}")
            raise ValueError("Negative/zero close prices detected")
        if not (df['high'] >= df['low']).all():
            raise ValueError("High prices < Low prices detected")
        # Check the index, not the close column: prices are not expected
        # to be monotonic, but timestamps are
        if not df.index.is_monotonic_increasing:
            self.logger.warning("Non-monotonic timestamps detected")

    def _filter_trading_hours(self, df: pd.DataFrame) -> pd.DataFrame:
        """Remove non-market hours (if intraday data)"""
        # Detect intraday data from the median bar spacing rather than
        # pd.infer_freq, whose codes ('5T', '5min', ...) vary by pandas
        # version and would not match a plain ('H', 'T') check
        if df.index.to_series().diff().median() < pd.Timedelta(days=1):
            return df.between_time('09:30', '16:00')
        return df

    def _remove_price_outliers(self, df: pd.DataFrame) -> pd.DataFrame:
        """Remove prices beyond 10 standard deviations"""
        returns = np.log(df['close']).diff().dropna()
        # fill_value=True keeps the first bar, whose return is undefined;
        # forward-filling cannot fill a leading NaN in a boolean mask
        mask = (returns.abs() < 10 * returns.std()).reindex(df.index, fill_value=True)
        return df[mask]

    def _remove_volume_outliers(self, df: pd.DataFrame) -> pd.DataFrame:
        """Remove volume spikes beyond 20 standard deviations"""
        # log1p avoids -inf on zero-volume bars
        log_volume = np.log1p(df['volume'])
        mask = log_volume < log_volume.mean() + 20 * log_volume.std()
        return df[mask]

    def process_tickers(self) -> Dict[str, pd.DataFrame]:
        """Process all tickers with cleaning and resampling"""
        processed = {}
        for ticker in self.config['tickers']:
            try:
                self.logger.info(f"Processing {ticker}")
                df = self._load_raw_data(ticker)
                df = self._clean_data(df)
                processed[ticker] = df
            except Exception as e:
                self.logger.error(f"Failed processing {ticker}: {e}")
                raise
        return processed
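
`DataProcessor` is driven entirely by its config dict. A sketch of the keys the code above actually reads; the values are examples, not project defaults:

# Keys referenced by DataProcessor; values are illustrative only.
config = {
    'data_dir': 'data/raw',      # directory of {ticker}_5min_3years.csv files
    'tickers': ['SPY', 'QQQ'],   # hypothetical symbols
    'resample_freq': '5min',     # pandas offset alias passed to resample()
}
processed = DataProcessor(config).process_tickers()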

@@ -0,0 +1,72 @@
# midas/feature_engineer.py
from typing import Dict

import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler


class FeatureEngineer:
    def __init__(self, config: Dict):
        self.config = config
        self.scaler = RobustScaler()  # Median/IQR scaling is robust to outliers

    def _calculate_obv(self, df: pd.DataFrame) -> pd.Series:
        """On-Balance Volume, normalized as a 14-period rate of change."""
        obv = (np.sign(df['close'].diff()) * df['volume']).fillna(0).cumsum()
        return obv.pct_change(periods=14)  # Can spike where OBV crosses zero

    def calculate_features(self, data: Dict[str, pd.DataFrame]) -> pd.DataFrame:
        all_features = []
        for ticker, df in data.items():
            features = pd.DataFrame(index=df.index)
            # Price-based features
            if 'returns' in self.config['features']:
                features['returns'] = np.log(df['close']).diff()
            if 'volatility' in self.config['features']:
                # Computed from prices directly so it does not require
                # 'returns' to also be enabled
                log_ret = np.log(df['close']).diff()
                features['volatility'] = log_ret.rolling(20).std() * np.sqrt(252)
            if 'rsi' in self.config['features']:
                # 14-period Relative Strength Index
                delta = df['close'].diff()
                gain = delta.where(delta > 0, 0).rolling(14).mean()
                loss = -delta.where(delta < 0, 0).rolling(14).mean()
                features['rsi'] = 100 - (100 / (1 + (gain / loss)))
            if 'macd' in self.config['features']:
                # MACD line: 12-period EMA minus 26-period EMA
                ema12 = df['close'].ewm(span=12, adjust=False).mean()
                ema26 = df['close'].ewm(span=26, adjust=False).mean()
                features['macd'] = ema12 - ema26
            if 'atr' in self.config['features']:
                # 14-period Average True Range, scaled by price
                high_low = df['high'] - df['low']
                high_close = (df['high'] - df['close'].shift()).abs()
                low_close = (df['low'] - df['close'].shift()).abs()
                tr = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
                features['atr'] = tr.rolling(14).mean() / df['close']
            if 'volume_change' in self.config['features']:
                features['volume_change'] = np.log(df['volume'] / df['volume'].shift(1))
            if 'obv' in self.config['features']:
                features['obv'] = self._calculate_obv(df)
            # Add ticker identifier if combining
            if self.config['combine_tickers']:
                features['ticker'] = ticker
            # Drop infs (e.g. from zero-volume bars) along with NaNs
            all_features.append(
                features.replace([np.inf, -np.inf], np.nan).dropna())
        combined = pd.concat(all_features).sort_index()
        # One-hot encode tickers if combining
        if self.config['combine_tickers']:
            combined = pd.get_dummies(combined, columns=['ticker'], prefix='', prefix_sep='')
        # Scale features
        scaled = pd.DataFrame(
            self.scaler.fit_transform(combined),
            index=combined.index,
            columns=combined.columns
        )
        return scaled
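
The engineer is likewise config-driven: `features` selects which indicator blocks run (the strings must match those checked above) and `combine_tickers` controls whether tickers are pooled into one one-hot-encoded frame. An illustrative config, reusing `processed` from the sketch above:

# Keys referenced by FeatureEngineer; values are illustrative only.
config = {
    'features': ['returns', 'volatility', 'rsi', 'macd',
                 'atr', 'volume_change', 'obv'],
    'combine_tickers': True,
}
engineer = FeatureEngineer(config)
features = engineer.calculate_features(processed)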

@@ -0,0 +1,74 @@
# midas/hmm_trainer.py
import logging
from itertools import groupby
from typing import Dict

import joblib
import numpy as np
import pandas as pd
from hmmlearn import hmm
from sklearn.model_selection import TimeSeriesSplit


class HMMTrainer:
    def __init__(self, config: Dict):
        self.config = config
        self.logger = logging.getLogger(__name__)
        self.best_model = None

    def _calculate_bic(self, model, X):
        """Bayesian Information Criterion"""
        log_likelihood = model.score(X)
        # Free parameters: transition matrix plus diagonal-Gaussian
        # means and variances per state
        n_params = model.n_components * (model.n_components - 1) + \
                   model.n_components * X.shape[1] * 2
        return -2 * log_likelihood + n_params * np.log(X.shape[0])

    def train(self, features: pd.DataFrame):
        best_score = -np.inf
        # Time-series cross validation
        tscv = TimeSeriesSplit(n_splits=3)
        for n_components in range(*self.config['n_states_range']):
            try:
                fold_scores = []
                for train_idx, test_idx in tscv.split(features):
                    X_train = features.iloc[train_idx]
                    X_test = features.iloc[test_idx]
                    model = hmm.GaussianHMM(
                        n_components=n_components,
                        covariance_type="diag",
                        n_iter=1000,
                        random_state=42
                    )
                    model.fit(X_train)
                    # Score using both likelihood and regime persistence
                    log_likelihood = model.score(X_test)
                    states = model.predict(X_test)
                    persistence = np.mean([len(list(g)) for _, g in groupby(states)])
                    score = log_likelihood + persistence
                    fold_scores.append(score)
                avg_score = np.mean(fold_scores)
                # BIC from the last fold's model, as a rough complexity check
                bic = self._calculate_bic(model, features)
                self.logger.info(f"States {n_components}: BIC={bic:.2f}, Score={avg_score:.2f}")
                if avg_score > best_score:
                    best_score = avg_score
                    self.best_model = model
            except Exception as e:
                self.logger.error(f"Failed training {n_components} states: {e}")
        if self.best_model is None:
            raise RuntimeError("No valid models trained")
        self.best_model.fit(features)  # Final training on full dataset
        return self.best_model

    def save_model(self, path: str, scaler=None):
        # The scaler lives on FeatureEngineer, so it is passed in
        # explicitly rather than read from self
        joblib.dump({
            'model': self.best_model,
            'scaler': scaler,
            'config': self.config
        }, path)
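
Tying the files together, a hedged end-to-end sketch: `n_states_range` (the only config key HMMTrainer reads) is unpacked into `range()`, so the pair below tries 2 through 5 states, and the fitted scaler is passed explicitly since it lives on the FeatureEngineer. File name and values are illustrative:

# Illustrative end-to-end run; config values are examples, not defaults.
trainer = HMMTrainer({'n_states_range': (2, 6)})
model = trainer.train(features)
trainer.save_model('midas_hmm.joblib', scaler=engineer.scaler)
analysis = MarketRegimeAnalysis(model, features)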