96 lines
3.5 KiB
Python
96 lines
3.5 KiB
Python
# midas/data_processor.py
|
|
import pandas as pd
|
|
import numpy as np
|
|
from pathlib import Path
|
|
import logging
|
|
from typing import List, Dict
|
|
|
|
class DataProcessor:
    """Load, validate, clean and resample per-ticker OHLCV bar data.

    The ``config`` mapping is read for the following keys:

    * ``data_dir`` -- directory holding ``<ticker>_5min_3years.csv`` files.
    * ``tickers`` -- iterable of ticker symbols used by ``process_tickers``.
    * ``resample_freq`` -- offset alias handed to ``DataFrame.resample``.
    """

    # How each known column is combined when several source rows fall into
    # the same resampled bar. Columns not listed here fall back to 'last'.
    _RESAMPLE_RULES = {
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum',
    }

    def __init__(self, config: Dict):
        """Store the configuration mapping and create a module-level logger."""
        self.config = config
        self.logger = logging.getLogger(__name__)

    def _load_raw_data(self, ticker: str) -> pd.DataFrame:
        """Read one ticker's CSV into a frame indexed by ``timestamp``.

        Args:
            ticker: Symbol used to build the file name
                ``<ticker>_5min_3years.csv`` inside ``config['data_dir']``.

        Returns:
            The open/high/low/close/volume columns as float32, sorted by
            timestamp and indexed by it.

        Raises:
            FileNotFoundError: If the expected CSV file does not exist.
        """
        file_path = Path(self.config['data_dir']) / f"{ticker}_5min_3years.csv"
        if not file_path.exists():
            raise FileNotFoundError(f"Data file not found: {file_path}")

        df = pd.read_csv(
            file_path,
            parse_dates=['timestamp'],
            usecols=['timestamp', 'open', 'high', 'low', 'close', 'volume'],
            dtype={
                'open': 'float32',
                'high': 'float32',
                'low': 'float32',
                'close': 'float32',
                'volume': 'float32',
            },
        )
        # A stable sort keeps rows that share a timestamp in file order, so
        # later "first"/"last" aggregations are deterministic.
        return df.sort_values('timestamp', kind='mergesort').set_index('timestamp')

    def _clean_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Run the cleaning pipeline and return the resampled frame.

        Steps, in order: drop rows containing NaN, validate prices, filter
        to trading hours, drop price and volume outliers, then resample to
        ``config['resample_freq']``.

        Raises:
            ValueError: Propagated from ``_validate_prices``.
        """
        # Handle missing values
        df = df.dropna()

        # Validate price data
        self._validate_prices(df)

        # Restrict intraday data to trading hours
        df = self._filter_trading_hours(df)

        # Remove outliers
        df = self._remove_price_outliers(df)
        df = self._remove_volume_outliers(df)

        # Resample and fill missing intervals
        return self._resample_bars(df)

    def _resample_bars(self, df: pd.DataFrame) -> pd.DataFrame:
        """Aggregate rows into ``config['resample_freq']`` bars.

        Open/high/low/close/volume are combined as first/max/min/last/sum;
        any other column takes its last value. Bars that received no rows
        get forward-filled values for every non-summed column, while summed
        columns (volume) stay at 0 because nothing traded in that bar.
        """
        if df.empty:
            return df

        rules = {col: self._RESAMPLE_RULES.get(col, 'last') for col in df.columns}
        bars = df.resample(self.config['resample_freq']).agg(rules)

        # NOTE(review): resample() creates a bar for every interval between
        # the first and last timestamp, so overnight/weekend gaps are filled
        # with carried-forward prices -- confirm downstream expects that.
        carry_forward = [col for col, rule in rules.items() if rule != 'sum']
        if carry_forward:
            bars[carry_forward] = bars[carry_forward].ffill()
        return bars

    def _validate_prices(self, df: pd.DataFrame):
        """Check basic price integrity.

        Raises:
            ValueError: If any close price is zero or negative, or if any
                row has ``high`` below ``low``.

        Logs a warning (without raising) when the index is not sorted in
        increasing order.
        """
        non_positive = df['close'] <= 0
        if non_positive.any():
            bad_values = df[non_positive]
            self.logger.error(f"Invalid close prices: {bad_values.index}")
            raise ValueError("Negative/zero close prices detected")

        if not (df['high'] >= df['low']).all():
            raise ValueError("High prices < Low prices detected")

        # The timestamps live in the index; prices are allowed to fall.
        if not df.index.is_monotonic_increasing:
            self.logger.warning("Non-monotonic timestamps detected")

    def _filter_trading_hours(self, df: pd.DataFrame) -> pd.DataFrame:
        """Remove non-market hours (if intraday data).

        Data counts as intraday when the median gap between consecutive
        timestamps is shorter than one day. Such frames are restricted to
        09:30-16:00 (both ends inclusive, per ``between_time`` defaults);
        anything else is returned unchanged.
        """
        if not isinstance(df.index, pd.DatetimeIndex) or len(df.index) < 2:
            return df

        # The median is robust to overnight/weekend gaps, unlike
        # pd.infer_freq, which gives up on any irregular index.
        spacing = df.index.to_series().diff().abs().median()
        if pd.isna(spacing) or spacing >= pd.Timedelta(days=1):
            return df
        return df.between_time('09:30', '16:00')

    def _remove_price_outliers(self, df: pd.DataFrame, n_std: float = 10.0) -> pd.DataFrame:
        """Drop rows whose absolute log return is at least ``n_std`` std devs.

        The return is the difference of consecutive log close prices and is
        compared against ``n_std * std(returns)`` measured from zero (not
        from the mean return). The first row has no return and is always
        kept.

        Args:
            df: Frame with a ``close`` column.
            n_std: Threshold in standard deviations; defaults to 10.

        Returns:
            ``df`` without the flagged rows, or ``df`` unchanged when the
            standard deviation is undefined or zero.
        """
        returns = np.log(df['close']).diff()
        threshold = n_std * returns.std()
        # Fewer than two returns (NaN std) or perfectly flat prices (zero
        # std) give no meaningful scale, so nothing can be called an outlier.
        if not np.isfinite(threshold) or threshold <= 0:
            return df

        # Comparisons against NaN are False, so the first row is never flagged.
        is_outlier = returns.abs() >= threshold
        return df[~is_outlier]

    def _remove_volume_outliers(self, df: pd.DataFrame, n_std: float = 20.0) -> pd.DataFrame:
        """Drop rows whose log volume is ``n_std`` std devs above the mean.

        Mean and standard deviation are computed over strictly positive
        volumes only. Zero-volume rows are kept (they cannot be spikes) and
        negative-volume rows are dropped.

        Args:
            df: Frame with a ``volume`` column.
            n_std: Threshold in standard deviations; defaults to 20.
        """
        volume = df['volume']
        # Mask non-positive volumes before the log so a single zero does not
        # turn the mean into -inf and the threshold into NaN.
        log_volume = np.log(volume.where(volume > 0))
        spread = log_volume.std()

        if np.isfinite(spread) and spread > 0:
            is_spike = log_volume >= log_volume.mean() + n_std * spread
        else:
            is_spike = pd.Series(False, index=df.index)

        return df[(volume >= 0) & ~is_spike]

    def process_tickers(self) -> Dict[str, pd.DataFrame]:
        """Load and clean every ticker listed in ``config['tickers']``.

        Returns:
            Mapping of ticker symbol to its cleaned, resampled frame.

        Raises:
            Exception: Any failure while loading or cleaning a ticker is
                logged and re-raised, aborting the remaining tickers.
        """
        processed = {}
        for ticker in self.config['tickers']:
            try:
                self.logger.info(f"Processing {ticker}")
                df = self._load_raw_data(ticker)
                df = self._clean_data(df)
                processed[ticker] = df
            except Exception as e:
                self.logger.error(f"Failed processing {ticker}: {str(e)}")
                raise
        return processed
|