Files
gwitt1Repo/MidasHMM/hmm/midas/feature_engineer.py
2025-01-29 23:39:42 -05:00

73 lines
3.0 KiB
Python

# midas/feature_engineer.py
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler
from typing import Dict
class FeatureEngineer:
    """Build scaled technical-indicator features from per-ticker price frames.

    ``config`` is read for two keys:

    * ``'features'`` -- container of feature names to compute; the names
      recognised are ``'returns'``, ``'volatility'``, ``'rsi'``, ``'macd'``,
      ``'atr'``, ``'volume_change'`` and ``'obv'``.
    * ``'combine_tickers'`` -- when truthy, a ticker label is attached to
      every row and one-hot encoded in the final frame.
    """

    def __init__(self, config: Dict):
        self.config = config
        self.scaler = RobustScaler()  # Handles outliers

    def _calculate_obv(self, df: pd.DataFrame) -> pd.Series:
        """Return the 14-period percentage change of On-Balance Volume.

        OBV is the running sum of ``volume`` signed by the direction of the
        ``close``-to-``close`` move. The first row has no previous close, so
        it contributes 0.

        Args:
            df: Frame with ``'close'`` and ``'volume'`` columns.

        Returns:
            Series aligned to ``df.index``. Entries whose OBV value 14 rows
            earlier was zero are NaN instead of +/-inf, so that a later
            ``dropna()`` removes them.
        """
        direction = np.sign(df['close'].diff())
        obv = (direction * df['volume']).fillna(0).cumsum()
        normalized = obv.pct_change(periods=14)  # Normalized OBV
        # OBV starts at 0 and may cross 0 again, so the ratio above can
        # divide by zero. Map the resulting infinities to NaN.
        return normalized.replace([np.inf, -np.inf], np.nan)

    def _build_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Compute the configured indicators for one ticker.

        Args:
            df: Price frame. Only the columns needed by the enabled features
                are accessed (``'close'``, ``'high'``, ``'low'``,
                ``'volume'``).

        Returns:
            Frame indexed like ``df`` with one column per enabled feature,
            with every row containing NaN or +/-inf removed.
        """
        enabled = self.config['features']
        features = pd.DataFrame(index=df.index)

        # Price-based features.
        # Volatility is derived from log returns, so compute them whenever
        # either feature is requested; previously a volatility-only config
        # failed because the 'returns' column did not exist.
        if 'returns' in enabled or 'volatility' in enabled:
            log_returns = np.log(df['close']).diff()
            if 'returns' in enabled:
                features['returns'] = log_returns
            if 'volatility' in enabled:
                # 20-row rolling std scaled by sqrt(252).
                features['volatility'] = (
                    log_returns.rolling(20).std() * np.sqrt(252)
                )

        if 'rsi' in enabled:
            delta = df['close'].diff()
            gain = delta.where(delta > 0, 0).rolling(14).mean()
            loss = -delta.where(delta < 0, 0).rolling(14).mean()
            # loss == 0 with gain > 0 yields an infinite ratio, which
            # evaluates to RSI 100; 0 / 0 yields NaN and the row is dropped.
            features['rsi'] = 100 - (100 / (1 + (gain / loss)))

        if 'macd' in enabled:
            ema12 = df['close'].ewm(span=12, adjust=False).mean()
            ema26 = df['close'].ewm(span=26, adjust=False).mean()
            features['macd'] = ema12 - ema26

        if 'atr' in enabled:
            prev_close = df['close'].shift()
            high_low = df['high'] - df['low']
            high_close = (df['high'] - prev_close).abs()
            low_close = (df['low'] - prev_close).abs()
            true_range = pd.concat(
                [high_low, high_close, low_close], axis=1
            ).max(axis=1)
            # 14-row mean true range, expressed relative to the close.
            features['atr'] = true_range.rolling(14).mean() / df['close']

        if 'volume_change' in enabled:
            # A zero-volume row makes this log ratio +/-inf; cleaned below.
            features['volume_change'] = np.log(
                df['volume'] / df['volume'].shift(1)
            )

        if 'obv' in enabled:
            features['obv'] = self._calculate_obv(df)

        # dropna() alone keeps +/-inf, which the scaler's input validation
        # rejects, so convert infinities to NaN first.
        return features.replace([np.inf, -np.inf], np.nan).dropna()

    def calculate_features(self, data: Dict[str, pd.DataFrame]) -> pd.DataFrame:
        """Compute, combine and scale the configured features for all tickers.

        Each ticker's frame is processed independently, incomplete rows are
        dropped, and the results are concatenated and sorted by index. When
        ``config['combine_tickers']`` is truthy the ticker label is one-hot
        encoded into columns named after the tickers. The whole frame
        (including any one-hot columns) is then passed through
        ``self.scaler.fit_transform``, so the scaler is re-fitted on every
        call.

        Args:
            data: Mapping of ticker symbol to its price frame.

        Returns:
            Scaled feature frame with the combined index and columns.

        Raises:
            ValueError: If ``data`` is empty.
        """
        if not data:
            raise ValueError(
                "calculate_features() requires at least one ticker frame"
            )

        combine = self.config['combine_tickers']
        all_features = []
        for ticker, df in data.items():
            features = self._build_features(df)
            # Add ticker identifier if combining
            if combine:
                features = features.assign(ticker=ticker)
            all_features.append(features)

        combined = pd.concat(all_features).sort_index()

        # Encode tickers if combining
        if combine:
            combined = pd.get_dummies(
                combined, columns=['ticker'], prefix='', prefix_sep=''
            )

        # Scale features
        scaled = pd.DataFrame(
            self.scaler.fit_transform(combined),
            index=combined.index,
            columns=combined.columns,
        )
        return scaled