Entirely new file path. Added Standard Practices, Moved Docs
This commit is contained in:
184
src/API/API_1.ipynb
Normal file
@@ -0,0 +1,184 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "69d88f26-f288-4a23-8be5-3e8317e23731",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "ERROR -1 2104 Market data farm connection is OK:usfarm.nj\n",
      "ERROR -1 2104 Market data farm connection is OK:usfuture\n",
      "ERROR -1 2104 Market data farm connection is OK:cashfarm\n",
      "ERROR -1 2104 Market data farm connection is OK:usfarm\n",
      "ERROR -1 2106 HMDS data farm connection is OK:ushmds\n",
      "ERROR -1 2158 Sec-def data farm connection is OK:secdefnj\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Historical Data Ended\n",
      " Date Open High Low Close Volume\n",
      "0 20241030 18:00:00 69.10 69.10 68.96 69.02 378\n",
      "1 20241030 18:05:00 69.02 69.07 69.01 69.05 99\n",
      "2 20241030 18:10:00 69.06 69.07 69.01 69.01 103\n",
      "3 20241030 18:15:00 69.01 69.02 69.00 69.00 54\n",
      "4 20241030 18:20:00 69.01 69.01 68.99 69.00 25\n",
      "5 20241030 18:25:00 69.00 69.05 69.00 69.04 40\n",
      "6 20241030 18:30:00 69.05 69.05 69.03 69.03 63\n",
      "7 20241030 18:35:00 69.03 69.03 69.00 69.00 64\n",
      "8 20241030 18:40:00 68.99 69.01 68.98 68.99 60\n",
      "9 20241030 18:45:00 68.99 68.99 68.95 68.97 66\n",
      "10 20241030 18:50:00 68.97 69.00 68.96 68.99 44\n",
      "11 20241030 18:55:00 68.98 68.98 68.97 68.98 23\n",
      "12 20241030 19:00:00 68.98 69.02 68.98 69.01 48\n",
      "13 20241030 19:05:00 69.02 69.03 69.00 69.01 31\n",
      "14 20241030 19:10:00 69.02 69.02 69.00 69.00 22\n",
      "15 20241030 19:15:00 69.00 69.00 68.99 68.99 11\n",
      "16 20241030 19:20:00 68.99 68.99 68.95 68.95 40\n",
      "17 20241030 19:25:00 68.95 68.95 68.94 68.94 55\n",
      "18 20241030 19:30:00 68.94 68.96 68.93 68.95 54\n",
      "19 20241030 19:35:00 68.95 68.97 68.95 68.96 29\n",
      "20 20241030 19:40:00 68.96 68.98 68.96 68.98 47\n",
      "21 20241030 19:45:00 68.98 68.99 68.95 68.95 65\n",
      "22 20241030 19:50:00 68.96 68.98 68.96 68.97 16\n",
      "23 20241030 19:55:00 68.97 68.97 68.94 68.94 35\n",
      "24 20241030 20:00:00 68.95 68.99 68.91 68.92 369\n",
      "25 20241030 20:05:00 68.91 68.94 68.91 68.93 74\n",
      "26 20241030 20:10:00 68.93 68.95 68.89 68.94 187\n",
      "27 20241030 20:15:00 68.94 68.95 68.92 68.94 81\n",
      "28 20241030 20:20:00 68.95 68.97 68.94 68.96 89\n",
      "29 20241030 20:25:00 68.96 68.96 68.92 68.94 96\n",
      "30 20241030 20:30:00 68.94 68.98 68.93 68.96 94\n",
      "31 20241030 20:35:00 68.97 68.97 68.93 68.94 66\n",
      "32 20241030 20:40:00 68.95 68.95 68.93 68.94 44\n",
      "33 20241030 20:45:00 68.93 68.96 68.93 68.94 98\n",
      "34 20241030 20:50:00 68.94 68.94 68.92 68.92 95\n"
     ]
    }
   ],
   "source": [
    "from ibapi.client import EClient\n",
    "from ibapi.wrapper import EWrapper\n",
    "from ibapi.contract import Contract\n",
    "import threading\n",
    "import time\n",
    "import pandas as pd\n",
    "\n",
    "# Define the IB API app\n",
    "class IBApi(EWrapper, EClient):\n",
    "    def __init__(self):\n",
    "        EClient.__init__(self, self)\n",
    "        self.data = [] # Initialize an empty list to store data\n",
    "\n",
    "    # Override the historicalData function to process and store incoming data\n",
    "    def historicalData(self, reqId, bar):\n",
    "        # Append the data as a dictionary to self.data\n",
    "        self.data.append({\n",
    "            \"Date\": bar.date,\n",
    "            \"Open\": bar.open,\n",
    "            \"High\": bar.high,\n",
    "            \"Low\": bar.low,\n",
    "            \"Close\": bar.close,\n",
    "            \"Volume\": bar.volume\n",
    "        })\n",
    "\n",
    "    def historicalDataEnd(self, reqId, start, end):\n",
    "        print(\"Historical Data Ended\")\n",
    "        # Convert the data to a DataFrame when data collection is complete\n",
    "        self.df = pd.DataFrame(self.data)\n",
    "        print(self.df) # Display the DataFrame to verify\n",
    "        self.disconnect() # Disconnect after data collection is complete\n",
    "\n",
    "# Define the app handler for running in the notebook\n",
    "class IBApp:\n",
    "    def __init__(self):\n",
    "        self.app = IBApi()\n",
    "\n",
    "    def connect(self):\n",
    "        self.app.connect(\"127.0.0.1\", 7496, 0) # Change port if needed\n",
    "        thread = threading.Thread(target=self.run_app, daemon=True)\n",
    "        thread.start()\n",
    "        time.sleep(1) # Allow time for the connection to establish\n",
    "\n",
    "    def run_app(self):\n",
    "        self.app.run()\n",
    "\n",
    "    def request_oil_data(self):\n",
    "        # Define the contract for Crude Oil Futures\n",
    "        contract = Contract()\n",
    "        contract.symbol = \"CL\"\n",
    "        contract.secType = \"FUT\"\n",
    "        contract.exchange = \"NYMEX\"\n",
    "        contract.currency = \"USD\"\n",
    "        contract.lastTradeDateOrContractMonth = \"202412\" # Example: Dec 2024 contract\n",
    "\n",
    "        # Request historical data\n",
    "        self.app.reqHistoricalData(\n",
    "            reqId=1,\n",
    "            contract=contract,\n",
    "            endDateTime='',\n",
    "            durationStr='1 D', # 1 day of data\n",
    "            barSizeSetting='5 mins',\n",
    "            whatToShow='TRADES',\n",
    "            useRTH=0,\n",
    "            formatDate=1,\n",
    "            keepUpToDate=False,\n",
    "            chartOptions=[]\n",
    "        )\n",
    "\n",
    "    def disconnect(self):\n",
    "        self.app.disconnect()\n",
    "\n",
    "# Create an instance and connect\n",
    "app = IBApp()\n",
    "app.connect()\n",
    "\n",
    "# Request data and output to a DataFrame\n",
    "app.request_oil_data()\n",
    "\n",
    "# Wait for data retrieval to complete\n",
    "time.sleep(10)\n",
    "\n",
    "# Access the DataFrame\n",
    "df = app.app.df if hasattr(app.app, 'df') else pd.DataFrame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "2088c621-81d3-46f0-8596-ce05d1a89fd4",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = df.to_csv()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
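The cell above blocks on a fixed time.sleep(10) before reading app.app.df. A minimal sketch of a more deterministic wait, assuming the IBApi and IBApp classes defined in that cell (the subclass name below is hypothetical, not part of this commit):

import threading
import pandas as pd

class IBApiWithEvent(IBApi):  # hypothetical subclass of the notebook's IBApi
    def __init__(self):
        super().__init__()
        self.done = threading.Event()  # set once historicalDataEnd has built self.df

    def historicalDataEnd(self, reqId, start, end):
        super().historicalDataEnd(reqId, start, end)  # builds self.df and disconnects
        self.done.set()

# Usage sketch: wait up to 30 seconds instead of sleeping a fixed amount
# app.app.done.wait(timeout=30)
# df = getattr(app.app, 'df', pd.DataFrame())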
2074
src/API/API_2 (1).ipynb
Normal file
File diff suppressed because one or more lines are too long
BIN
src/API/Trading_Bot_Development_Strategy (1).docx
Normal file
Binary file not shown.
1
src/Data-Collection/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
venv/
0
src/Data-Collection/WebScraper/README.md
Normal file
16
src/Data-Collection/WebScraper/assets/oil_key_words.txt
Normal file
@@ -0,0 +1,16 @@
oil 5
profit 4
price 3
gas 4
energy 5
production 3
demand 2
supply 2
barrel 3
economy 4
investment 3
revenue 4
loss 2
rise 5
decline 1
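Each line above pairs a keyword with an integer importance weight. A minimal parsing sketch, mirroring the load_keyword_importance helpers added elsewhere in this commit (the relative path is an assumption):

weights = {}
with open("assets/oil_key_words.txt", encoding="utf-8") as f:
    for line in f:
        parts = line.split()  # e.g. ["oil", "5"]
        if len(parts) == 2:
            word, importance = parts
            weights[word.lower()] = int(importance)
# weights["oil"] == 5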
2471
src/Data-Collection/WebScraper/data/oil_news.json
Normal file
File diff suppressed because it is too large
4003
src/Data-Collection/WebScraper/data/preprocessed_oil_news.json
Normal file
File diff suppressed because it is too large
47
src/Data-Collection/WebScraper/main.py
Normal file
@@ -0,0 +1,47 @@
import argparse
import sys
import time
import scrapers.oil_news_scraper as oil_news
import scrapers.oil_news_preprocessor as oil_news_preprocessor
from tqdm import tqdm

def show_usage_bar(duration):
    for _ in tqdm(range(duration), desc="Processing", unit="sec"):
        time.sleep(1)

def run_scraper():
    print("Starting oil data collection with the scraper...")
    show_usage_bar(0)  # Simulated progress bar duration
    oil_news.run_scraper()
    print("Oil news data scraping completed.")

def run_preprocessor():
    print("Starting oil data collection with the preprocessor...")
    show_usage_bar(0)  # Simulated progress bar duration
    oil_news_preprocessor.run_preprocessor()
    print("Oil news data preprocessing completed.")

def main():
    parser = argparse.ArgumentParser(
        description="Oil News Data Collection Tool"
    )
    parser.add_argument(
        "--scraper", action="store_true", help="Run the oil news scraper (original code)."
    )
    parser.add_argument(
        "--preprocessed", action="store_true", help="Run the oil news preprocessor (new code for sentiment analysis)."
    )

    args = parser.parse_args()

    if args.scraper:
        run_scraper()
    elif args.preprocessed:
        run_preprocessor()
    else:
        print("No valid option selected. Use '--scraper' to run the scraper or '--preprocessed' to run the preprocessor.")
        parser.print_help()

if __name__ == "__main__":
    main()
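Assuming the dependencies (selenium, beautifulsoup4, tqdm) are installed and the script is run from the WebScraper directory, the tool is invoked with one of:

python main.py --scraper        (writes data/oil_news.json)
python main.py --preprocessed   (writes data/preprocessed_oil_news.json)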
13
src/Data-Collection/WebScraper/main.py.bak
Normal file
@@ -0,0 +1,13 @@
# main.py
import scrapers.oil_news_scraper as oil_news

def main():
    print("Starting oil data collection...")

    # Run oil market news scraper
    oil_news.run_scraper()

    print("Oil news data scraping completed.")

if __name__ == "__main__":
    main()
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,231 @@
|
||||
import json
|
||||
import re
|
||||
import os
|
||||
import time
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.firefox.options import Options
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from bs4 import BeautifulSoup
|
||||
from tqdm import tqdm # Progress bar
|
||||
|
||||
OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
|
||||
SCRAPER_DIR = os.path.dirname(os.path.dirname(__file__)) # One level up
|
||||
DATA_DIR = os.path.join(SCRAPER_DIR, "data")
|
||||
KEYWORD_FILE_PATH = os.path.join(SCRAPER_DIR, "assets", "oil_key_words.txt")
|
||||
|
||||
if not os.path.exists(DATA_DIR):
|
||||
os.makedirs(DATA_DIR)
|
||||
|
||||
def load_existing_data(file_path):
|
||||
if os.path.exists(file_path):
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
return json.load(f)
|
||||
return []
|
||||
|
||||
def save_to_json(data, file_path):
|
||||
existing_data = load_existing_data(file_path)
|
||||
existing_links = {article['link'] for article in existing_data if 'link' in article}
|
||||
|
||||
new_data = []
|
||||
for article in data:
|
||||
if 'link' not in article or article['link'] in existing_links:
|
||||
print(f"Skipping duplicate or missing link article: {article.get('headline', 'Unknown Headline')}")
|
||||
continue
|
||||
new_data.append(article)
|
||||
|
||||
combined_data = existing_data + new_data
|
||||
|
||||
with open(file_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(combined_data, f, ensure_ascii=False, indent=4)
|
||||
print(f"Data saved to {file_path}")
|
||||
|
||||
def load_keyword_importance(file_path):
|
||||
keyword_importance = {}
|
||||
if os.path.exists(file_path):
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
for line in f:
|
||||
parts = line.strip().split()
|
||||
if len(parts) == 2:
|
||||
keyword, importance = parts
|
||||
keyword_importance[keyword.lower()] = int(importance)
|
||||
else:
|
||||
print(f"Keyword file not found at {file_path}")
|
||||
return keyword_importance
|
||||
|
||||
keyword_importance = load_keyword_importance(KEYWORD_FILE_PATH)
|
||||
|
||||
def extract_keywords(text, keyword_importance):
|
||||
words = re.findall(r'\b\w+\b', text.lower())
|
||||
keywords = {word: keyword_importance[word] for word in words if word in keyword_importance}
|
||||
return sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:10]
|
||||
|
||||
def filter_content(content):
|
||||
"""Remove advertisements, irrelevant phrases, headers, and disclaimers from content."""
|
||||
patterns = [
|
||||
r'ADVERTISEMENT',
|
||||
r'Click Here for \d+\+ Global Oil Prices',
|
||||
r'Find us on:',
|
||||
r'Back to homepage',
|
||||
r'Join the discussion',
|
||||
r'More Top Reads From Oilprice.com',
|
||||
r'©OilPrice\.com.*?educational purposes',
|
||||
r'A Media Solutions.*?Oilprice.com',
|
||||
r'\"It\'s most important 8 minute read of my week…\"',
|
||||
r'^[\w\s]*?is a [\w\s]*? for Oilprice\.com.*?More Info',
|
||||
r'^.*?DNOW is a supplier.*?,',
|
||||
]
|
||||
|
||||
for pattern in patterns:
|
||||
content = re.sub(pattern, '', content, flags=re.IGNORECASE)
|
||||
content = re.sub(r'\s+', ' ', content).strip()
|
||||
return content
|
||||
|
||||
def extract_author_info(driver, article_soup, headline_pages=1):
|
||||
"""Extract detailed author information from the 'read more' link if available."""
|
||||
author = "Unknown Author"
|
||||
author_bio = ""
|
||||
contributor_since = ""
|
||||
other_articles = []
|
||||
|
||||
author_tag = article_soup.find('a', text=re.compile(r'More Info|Read More', re.IGNORECASE))
|
||||
if author_tag:
|
||||
retries = 3 # Set retry limit
|
||||
for attempt in range(retries):
|
||||
try:
|
||||
driver.get(author_tag['href'])
|
||||
WebDriverWait(driver, 15).until(
|
||||
EC.presence_of_element_located((By.CLASS_NAME, "authorBio"))
|
||||
)
|
||||
bio_soup = BeautifulSoup(driver.page_source, "html.parser")
|
||||
|
||||
# Extract author's name
|
||||
author_name_tag = bio_soup.find('h1')
|
||||
author = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"
|
||||
|
||||
# Extract author's bio description
|
||||
author_bio_tag = bio_soup.find('p')
|
||||
author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"
|
||||
|
||||
# Extract contributor since date
|
||||
contributor_since_tag = bio_soup.find(text=re.compile(r"Contributor since", re.IGNORECASE))
|
||||
if contributor_since_tag:
|
||||
contributor_since = contributor_since_tag.parent.get_text(strip=True).replace("Contributor since: ", "")
|
||||
|
||||
# Extract headlines of latest articles by the author, limited by `headline_pages`
|
||||
for page in range(1, headline_pages + 1):
|
||||
driver.get(f"{author_tag['href']}Page-{page}.html")
|
||||
WebDriverWait(driver, 10).until(
|
||||
EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
|
||||
)
|
||||
page_soup = BeautifulSoup(driver.page_source, "html.parser")
|
||||
article_tags = page_soup.find_all('h2', class_='categoryArticle__title')
|
||||
|
||||
for article in article_tags:
|
||||
other_articles.append(article.get_text(strip=True))
|
||||
|
||||
break # Break loop if successful
|
||||
|
||||
except Exception as e:
|
||||
print(f"Attempt {attempt + 1} failed for author bio page. Retrying...")
|
||||
time.sleep(2) # Wait before retrying
|
||||
if attempt == retries - 1:
|
||||
print(f"Author bio page failed to load or extract after {retries} attempts. Error: {e}")
|
||||
|
||||
return {
|
||||
"name": author,
|
||||
"bio": author_bio,
|
||||
"contributor_since": contributor_since,
|
||||
"other_articles": other_articles
|
||||
}
|
||||
|
||||
def scrape_oil_news():
|
||||
print("Scraping oil news articles for sentiment analysis...")
|
||||
|
||||
options = Options()
|
||||
options.headless = True
|
||||
driver = webdriver.Firefox(options=options)
|
||||
|
||||
news_data = []
|
||||
page_number = 1
|
||||
max_pages = 1
|
||||
total_articles = 0
|
||||
|
||||
while page_number <= max_pages:
|
||||
driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
|
||||
try:
|
||||
WebDriverWait(driver, 10).until(
|
||||
EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
|
||||
)
|
||||
except:
|
||||
break
|
||||
soup = BeautifulSoup(driver.page_source, "html.parser")
|
||||
total_articles += len(soup.find_all('div', class_='categoryArticle'))
|
||||
page_number += 1
|
||||
|
||||
page_number = 1
|
||||
with tqdm(total=total_articles, desc="Scraping articles", unit="article") as pbar:
|
||||
while page_number <= max_pages:
|
||||
print(f"\nProcessing page {page_number}...")
|
||||
driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
|
||||
soup = BeautifulSoup(driver.page_source, "html.parser")
|
||||
articles = soup.find_all('div', class_='categoryArticle')
|
||||
if not articles:
|
||||
break
|
||||
|
||||
for article in articles:
|
||||
headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
|
||||
link_tag = article.find('a', href=True)
|
||||
link = link_tag['href'] if link_tag else None
|
||||
date_meta = article.find('p', class_='categoryArticle__meta')
|
||||
date = date_meta.get_text(strip=True).split('|')[0].strip() if date_meta else None
|
||||
|
||||
content = ""
|
||||
if link:
|
||||
print(f"Fetching article: {link}")
|
||||
driver.get(link)
|
||||
try:
|
||||
WebDriverWait(driver, 10).until(
|
||||
EC.presence_of_element_located((By.CLASS_NAME, "singleArticle"))
|
||||
)
|
||||
article_soup = BeautifulSoup(driver.page_source, "html.parser")
|
||||
raw_content = " ".join([p.get_text(strip=True) for p in article_soup.find_all('p')])
|
||||
content = filter_content(raw_content)
|
||||
author, author_bio = extract_author_info(driver, article_soup)
|
||||
except:
|
||||
print(f"Error: Content did not load for article {headline}.")
|
||||
|
||||
extracted_keywords = extract_keywords(f"{headline} {content}", keyword_importance)
|
||||
|
||||
if headline and link and date:
|
||||
author_info = extract_author_info(driver, article_soup, headline_pages=1)
|
||||
news_data.append({
|
||||
'headline': headline,
|
||||
'link': link,
|
||||
'content': content,
|
||||
'date': date,
|
||||
'author': author_info['name'],
|
||||
'author_bio': author_info['bio'],
|
||||
'contributor_since': author_info['contributor_since'],
|
||||
'other_articles': author_info['other_articles'],
|
||||
'keywords': extracted_keywords,
|
||||
})
|
||||
|
||||
pbar.set_postfix_str(f"Processing article: {headline[:40]}...")
|
||||
pbar.update(1)
|
||||
|
||||
page_number += 1
|
||||
time.sleep(2)
|
||||
|
||||
driver.quit()
|
||||
return news_data
|
||||
|
||||
def run_preprocessor():
|
||||
file_path = os.path.join(DATA_DIR, 'preprocessed_oil_news.json')
|
||||
news_data = scrape_oil_news()
|
||||
save_to_json(news_data, file_path)
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_preprocessor()
|
||||
|
||||
@@ -0,0 +1,100 @@
|
||||
import json
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.firefox.options import Options
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from bs4 import BeautifulSoup
|
||||
import os
|
||||
import time
|
||||
import re
|
||||
|
||||
OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
|
||||
DATA_DIR = os.path.join(os.getcwd(), "data")
|
||||
if not os.path.exists(DATA_DIR):
|
||||
os.makedirs(DATA_DIR)
|
||||
|
||||
def load_existing_data(file_path):
|
||||
"""Load existing data from JSON file to avoid duplicates."""
|
||||
if os.path.exists(file_path):
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
return json.load(f)
|
||||
return []
|
||||
|
||||
def save_to_json(data, file_path):
|
||||
"""Save scraped data to a JSON file, ensuring no duplicates."""
|
||||
existing_data = load_existing_data(file_path)
|
||||
existing_links = {article['link'] for article in existing_data}
|
||||
|
||||
new_data = [article for article in data if article['link'] not in existing_links]
|
||||
combined_data = existing_data + new_data
|
||||
|
||||
with open(file_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(combined_data, f, ensure_ascii=False, indent=4)
|
||||
print(f"Oil news data saved to {file_path}")
|
||||
|
||||
def extract_keywords(text):
|
||||
"""Simple function to extract keywords from text."""
|
||||
keywords = re.findall(r'\b\w+\b', text.lower())
|
||||
return list(set(keywords))[:10] # Return the first 10 unique keywords
|
||||
|
||||
def scrape_oil_news():
|
||||
print("Scraping oil market news using Selenium...")
|
||||
|
||||
options = Options()
|
||||
options.headless = True
|
||||
driver = webdriver.Firefox(options=options)
|
||||
|
||||
news_data = []
|
||||
page_number = 1
|
||||
max_pages = 10 # Limit to 10 pages
|
||||
|
||||
while page_number <= max_pages:
|
||||
# Load the page with pagination
|
||||
driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
|
||||
|
||||
try:
|
||||
WebDriverWait(driver, 20).until(
|
||||
EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"Error: Content did not load properly on page {page_number}.")
|
||||
break
|
||||
|
||||
soup = BeautifulSoup(driver.page_source, "html.parser")
|
||||
|
||||
articles = soup.find_all('div', class_='categoryArticle')
|
||||
if not articles:
|
||||
print(f"No articles found on page {page_number}. Ending pagination.")
|
||||
break
|
||||
|
||||
for article in articles:
|
||||
headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
|
||||
link = article.find('a', href=True)['href'] if article.find('a', href=True) else None
|
||||
date = article.find('p', class_='categoryArticle__meta').get_text(strip=True) if article.find('p', class_='categoryArticle__meta') else None
|
||||
excerpt = article.find('p', class_='categoryArticle__excerpt').get_text(strip=True) if article.find('p', class_='categoryArticle__excerpt') else None
|
||||
author = date.split('|')[-1].strip() if '|' in date else "Unknown Author"
|
||||
timestamp = date.split('|')[0].strip() if '|' in date else date
|
||||
|
||||
if headline and link and date:
|
||||
news_data.append({
|
||||
'headline': headline,
|
||||
'link': link,
|
||||
'date': timestamp,
|
||||
'author': author,
|
||||
'excerpt': excerpt,
|
||||
'keywords': extract_keywords(headline + " " + excerpt if excerpt else headline),
|
||||
'sentiment_analysis': None # Placeholder for future sentiment analysis
|
||||
})
|
||||
|
||||
page_number += 1
|
||||
time.sleep(2)
|
||||
|
||||
driver.quit()
|
||||
return news_data
|
||||
|
||||
def run_scraper():
|
||||
file_path = os.path.join(DATA_DIR, 'oil_news.json')
|
||||
news_data = scrape_oil_news()
|
||||
save_to_json(news_data, file_path)
|
||||
|
||||
251
src/Data-Collection/WebScraper/scrapers/oil_news_preprocessor.py
Normal file
@@ -0,0 +1,251 @@
|
||||
import json
|
||||
import re
|
||||
import os
|
||||
import time
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.firefox.options import Options
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from bs4 import BeautifulSoup
|
||||
from tqdm import tqdm # Progress bar
|
||||
|
||||
OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
|
||||
SCRAPER_DIR = os.path.dirname(os.path.dirname(__file__)) # One level up
|
||||
DATA_DIR = os.path.join(SCRAPER_DIR, "data")
|
||||
KEYWORD_FILE_PATH = os.path.join(SCRAPER_DIR, "assets", "oil_key_words.txt")
|
||||
|
||||
if not os.path.exists(DATA_DIR):
|
||||
os.makedirs(DATA_DIR)
|
||||
|
||||
def load_existing_data(file_path):
|
||||
if os.path.exists(file_path):
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
return json.load(f)
|
||||
return []
|
||||
|
||||
def save_to_json(data, file_path):
|
||||
existing_data = load_existing_data(file_path)
|
||||
existing_links = {article['link'] for article in existing_data if 'link' in article}
|
||||
|
||||
new_data = []
|
||||
for article in data:
|
||||
if 'link' not in article or article['link'] in existing_links:
|
||||
print(f"Skipping duplicate or missing link article: {article.get('headline', 'Unknown Headline')}")
|
||||
continue
|
||||
new_data.append(article)
|
||||
|
||||
combined_data = existing_data + new_data
|
||||
|
||||
with open(file_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(combined_data, f, ensure_ascii=False, indent=4)
|
||||
print(f"Data saved to {file_path}")
|
||||
|
||||
def load_keyword_importance(file_path):
|
||||
keyword_importance = {}
|
||||
if os.path.exists(file_path):
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
for line in f:
|
||||
parts = line.strip().split()
|
||||
if len(parts) == 2:
|
||||
keyword, importance = parts
|
||||
keyword_importance[keyword.lower()] = int(importance)
|
||||
else:
|
||||
print(f"Keyword file not found at {file_path}")
|
||||
return keyword_importance
|
||||
|
||||
keyword_importance = load_keyword_importance(KEYWORD_FILE_PATH)
|
||||
|
||||
def extract_keywords(text, keyword_importance):
|
||||
words = re.findall(r'\b\w+\b', text.lower())
|
||||
keywords = {word: keyword_importance[word] for word in words if word in keyword_importance}
|
||||
return sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:10]
|
||||
|
||||
def filter_content(content):
|
||||
"""Remove advertisements, irrelevant phrases, headers, and disclaimers from content."""
|
||||
patterns = [
|
||||
r'ADVERTISEMENT',
|
||||
r'Click Here for \d+\+ Global Oil Prices',
|
||||
r'Find us on:',
|
||||
r'Back to homepage',
|
||||
r'Join the discussion',
|
||||
r'More Top Reads From Oilprice.com',
|
||||
r'©OilPrice\.com.*?educational purposes',
|
||||
r'A Media Solutions.*?Oilprice.com',
|
||||
r'\"It\'s most important 8 minute read of my week…\"',
|
||||
r'^[\w\s]*?is a [\w\s]*? for Oilprice\.com.*?More Info',
|
||||
r'^.*?DNOW is a supplier.*?,',
|
||||
]
|
||||
|
||||
for pattern in patterns:
|
||||
content = re.sub(pattern, '', content, flags=re.IGNORECASE)
|
||||
content = re.sub(r'\s+', ' ', content).strip()
|
||||
return content
|
||||
|
||||
def scrape_author_info(driver, author_url, headline_pages=1):
|
||||
"""Scrape author's name, bio, contributor since date, and latest article headlines with excerpts, keywords, and timestamp."""
|
||||
author_name = "Unknown"
|
||||
author_bio = ""
|
||||
contributor_since = ""
|
||||
other_articles = []
|
||||
|
||||
try:
|
||||
# Load author page
|
||||
driver.get(author_url)
|
||||
WebDriverWait(driver, 15).until(
|
||||
EC.presence_of_element_located((By.TAG_NAME, "h1"))
|
||||
)
|
||||
page_source = driver.page_source
|
||||
bio_soup = BeautifulSoup(page_source, "html.parser")
|
||||
|
||||
# Extract author name
|
||||
author_name_tag = bio_soup.find('h1')
|
||||
author_name = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"
|
||||
|
||||
# Extract author bio
|
||||
author_bio_tag = bio_soup.find('div', class_='biography')
|
||||
author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"
|
||||
|
||||
# Extract contributor since date
|
||||
contributor_since_tag = bio_soup.find('p', class_='contributor_since')
|
||||
contributor_since = contributor_since_tag.get_text(strip=True).replace("Contributor since: ", "") if contributor_since_tag else "Unknown Date"
|
||||
|
||||
# Extract latest articles by author with heading, excerpt, keywords, and timestamp
|
||||
for page in range(1, headline_pages + 1):
|
||||
driver.get(f"{author_url}/Page-{page}.html")
|
||||
WebDriverWait(driver, 10).until(
|
||||
EC.presence_of_element_located((By.CLASS_NAME, "articles"))
|
||||
)
|
||||
page_soup = BeautifulSoup(driver.page_source, "html.parser")
|
||||
article_tags = page_soup.find_all('li', class_='clear')
|
||||
|
||||
for article in article_tags:
|
||||
heading_tag = article.find('h3')
|
||||
excerpt_tag = article.find('p', class_='articlecontent')
|
||||
timestamp_tag = article.find('div', class_='meta')
|
||||
|
||||
if heading_tag and excerpt_tag and timestamp_tag:
|
||||
heading = heading_tag.get_text(strip=True)
|
||||
excerpt = filter_content(excerpt_tag.get_text(strip=True)) # Use filter_content
|
||||
timestamp = timestamp_tag.get_text(strip=True).split("|")[0].replace("Published ", "").strip()
|
||||
keywords = [keyword for keyword, _ in extract_keywords(excerpt, keyword_importance)]
|
||||
|
||||
other_articles.append({
|
||||
"heading": heading,
|
||||
"excerpt": excerpt,
|
||||
"keywords": keywords,
|
||||
"published_date": timestamp
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error scraping author info: {e}")
|
||||
author_name = "Error Occurred"
|
||||
author_bio = str(e)
|
||||
contributor_since = "N/A"
|
||||
other_articles = [{"heading": "Error retrieving articles", "excerpt": "", "keywords": [], "published_date": ""}]
|
||||
|
||||
return {
|
||||
"name": author_name,
|
||||
"bio": author_bio,
|
||||
"contributor_since": contributor_since,
|
||||
"other_articles": other_articles
|
||||
}
|
||||
|
||||
def scrape_oil_news():
|
||||
print("Scraping oil news articles for sentiment analysis...")
|
||||
|
||||
options = Options()
|
||||
options.headless = True
|
||||
driver = webdriver.Firefox(options=options)
|
||||
|
||||
news_data = []
|
||||
page_number = 1
|
||||
max_pages = 1
|
||||
total_articles = 0
|
||||
|
||||
while page_number <= max_pages:
|
||||
driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
|
||||
try:
|
||||
WebDriverWait(driver, 10).until(
|
||||
EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
|
||||
)
|
||||
except:
|
||||
break
|
||||
soup = BeautifulSoup(driver.page_source, "html.parser")
|
||||
total_articles += len(soup.find_all('div', class_='categoryArticle'))
|
||||
page_number += 1
|
||||
|
||||
page_number = 1
|
||||
with tqdm(total=total_articles, desc="Scraping articles", unit="article") as pbar:
|
||||
while page_number <= max_pages:
|
||||
print(f"\nProcessing page {page_number}...")
|
||||
driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
|
||||
soup = BeautifulSoup(driver.page_source, "html.parser")
|
||||
articles = soup.find_all('div', class_='categoryArticle')
|
||||
if not articles:
|
||||
break
|
||||
|
||||
for article in articles:
|
||||
headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
|
||||
link_tag = article.find('a', href=True)
|
||||
link = link_tag['href'] if link_tag else None
|
||||
date_meta = article.find('p', class_='categoryArticle__meta')
|
||||
date = date_meta.get_text(strip=True).split('|')[0].strip() if date_meta else None
|
||||
|
||||
content = ""
|
||||
if link:
|
||||
print(f"Fetching article: {link}")
|
||||
driver.get(link)
|
||||
try:
|
||||
WebDriverWait(driver, 10).until(
|
||||
EC.presence_of_element_located((By.CLASS_NAME, "singleArticle"))
|
||||
)
|
||||
article_soup = BeautifulSoup(driver.page_source, "html.parser")
|
||||
raw_content = " ".join([p.get_text(strip=True) for p in article_soup.find_all('p')])
|
||||
content = filter_content(raw_content)
|
||||
|
||||
# Fetch author info using scrape_author_info
|
||||
author_url = article_soup.find('a', text=re.compile(r'More Info|Read More', re.IGNORECASE))['href']
|
||||
author_info = scrape_author_info(driver, author_url, headline_pages=1)
|
||||
|
||||
except:
|
||||
print(f"Error: Content did not load for article {headline}.")
|
||||
author_info = {
|
||||
"name": "Unknown",
|
||||
"bio": "",
|
||||
"contributor_since": "",
|
||||
"other_articles": []
|
||||
}
|
||||
|
||||
extracted_keywords = extract_keywords(f"{headline} {content}", keyword_importance)
|
||||
|
||||
if headline and link and date:
|
||||
news_data.append({
|
||||
'headline': headline,
|
||||
'link': link,
|
||||
'content': content,
|
||||
'date': date,
|
||||
'author': author_info['name'],
|
||||
'author_bio': author_info['bio'],
|
||||
'contributor_since': author_info['contributor_since'],
|
||||
'other_articles': author_info['other_articles'],
|
||||
'keywords': extracted_keywords,
|
||||
})
|
||||
|
||||
pbar.set_postfix_str(f"Processing article: {headline[:40]}...")
|
||||
pbar.update(1)
|
||||
|
||||
page_number += 1
|
||||
time.sleep(2)
|
||||
|
||||
driver.quit()
|
||||
return news_data
|
||||
|
||||
def run_preprocessor():
|
||||
file_path = os.path.join(DATA_DIR, 'preprocessed_oil_news.json')
|
||||
news_data = scrape_oil_news()
|
||||
save_to_json(news_data, file_path)
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_preprocessor()
|
||||
|
||||
143
src/Data-Collection/WebScraper/scrapers/oil_news_scraper.py
Normal file
@@ -0,0 +1,143 @@
import json
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import os
import time
import re

OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
DATA_DIR = os.path.join(os.getcwd(), "data")
KEYWORD_FILE_PATH = os.path.join(os.getcwd(), "assets", "oil_key_words.txt")

if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

def load_existing_data(file_path):
    """Load existing data from JSON file to avoid duplicates."""
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    return []

def save_to_json(data, file_path):
    """Save scraped data to a JSON file, ensuring no duplicates."""
    existing_data = load_existing_data(file_path)
    existing_links = {article['link'] for article in existing_data}

    new_data = []
    for article in data:
        if article['link'] in existing_links:
            print(f"Skipping duplicate article: {article['headline']}")
            continue
        new_data.append(article)

    combined_data = existing_data + new_data

    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(combined_data, f, ensure_ascii=False, indent=4)
    print(f"Oil news data saved to {file_path}")

def load_keyword_importance(file_path):
    """Load keyword importance values from the oil_key_words.txt file."""
    keyword_importance = {}
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) == 2:
                    keyword, importance = parts
                    keyword_importance[keyword.lower()] = int(importance)
    else:
        print(f"Keyword file not found at {file_path}")
    return keyword_importance

keyword_importance = load_keyword_importance(KEYWORD_FILE_PATH)

def extract_keywords(text, keyword_importance):
    """Extract important keywords from text based on an external keyword list."""
    words = re.findall(r'\b\w+\b', text.lower())
    keywords = {}

    for word in words:
        if len(word) > 3 and word in keyword_importance:
            keywords[word] = keyword_importance[word]  # Store keyword with its importance

    # Return up to 10 unique keywords with their importance
    return sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:10]

def analyze_sentiment(text):
    """Basic sentiment analysis placeholder with minimal processing."""
    # Only check for specific keywords; avoid complex logic to save time
    if "profit" in text or "rise" in text:
        return "Positive"
    elif "loss" in text or "decline" in text:
        return "Negative"
    else:
        return "Neutral"

def scrape_oil_news():
    print("Scraping oil market news using Selenium...")

    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options)

    news_data = []
    page_number = 1
    max_pages = 10  # Limit to 10 pages

    while page_number <= max_pages:
        print(f"Processing page {page_number}...")
        driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")

        try:
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
            )
        except Exception as e:
            print(f"Error: Content did not load properly on page {page_number}.")
            break

        soup = BeautifulSoup(driver.page_source, "html.parser")

        articles = soup.find_all('div', class_='categoryArticle')
        if not articles:
            print(f"No articles found on page {page_number}. Ending pagination.")
            break

        for article in articles:
            headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
            link = article.find('a', href=True)['href'] if article.find('a', href=True) else None
            date = article.find('p', class_='categoryArticle__meta').get_text(strip=True) if article.find('p', class_='categoryArticle__meta') else None
            excerpt = article.find('p', class_='categoryArticle__excerpt').get_text(strip=True) if article.find('p', class_='categoryArticle__excerpt') else None
            author = date.split('|')[-1].strip() if '|' in date else "Unknown Author"
            timestamp = date.split('|')[0].strip() if '|' in date else date
            extracted_keywords = extract_keywords(headline + " " + excerpt if excerpt else headline, keyword_importance)

            if headline and link and date:
                news_data.append({
                    'headline': headline,
                    'link': link,
                    'date': timestamp,
                    'author': author,
                    'excerpt': excerpt,
                    'keywords': extracted_keywords,
                    'sentiment_analysis': None
                    #'sentiment_analysis': analyze_sentiment(headline + " " + excerpt if excerpt else headline)
                })

        page_number += 1
        time.sleep(2)

    driver.quit()
    return news_data

def run_scraper():
    file_path = os.path.join(DATA_DIR, 'oil_news.json')
    news_data = scrape_oil_news()
    save_to_json(news_data, file_path)
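Note that date can be None when categoryArticle__meta is missing, in which case the '|' in date check above raises a TypeError before the if headline and link and date guard runs. A minimal defensive rewrite of those two lines, offered as a sketch rather than as part of this commit:

if date and '|' in date:
    author = date.split('|')[-1].strip()
    timestamp = date.split('|')[0].strip()
else:
    author = "Unknown Author"
    timestamp = date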
347
src/Data-Collection/WebScraper/scrapers/tests/author_info.json
Normal file
@@ -0,0 +1,347 @@
|
||||
{
|
||||
"name": "Charles Kennedy",
|
||||
"bio": "Charles is a writer for Oilprice.com",
|
||||
"contributor_since": "29 Sep 2011",
|
||||
"other_articles": [
|
||||
{
|
||||
"heading": "Record Shale Production Helps ConocoPhillips Beat Profit Estimates",
|
||||
"excerpt": "ConocoPhillips (NYSE: COP) is raising its ordinary dividend and share buyback program as its third-quarter earnings beat market expectations on the back of higher total…",
|
||||
"keywords": [
|
||||
"share",
|
||||
"market",
|
||||
"higher",
|
||||
"back",
|
||||
"total",
|
||||
"expectations",
|
||||
"third",
|
||||
"beat",
|
||||
"raising",
|
||||
"conocophillips"
|
||||
],
|
||||
"published_date": "31 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Rosneft to Resume Output at Idled Black Sea Refinery in November",
|
||||
"excerpt": "Rosneft plans to resume crude processing at its Tuapse oil refinery on Russia’s Black Sea coast in November, after idling it for a month because…",
|
||||
"keywords": [
|
||||
"processing",
|
||||
"idling",
|
||||
"russia",
|
||||
"plans",
|
||||
"rosneft",
|
||||
"refinery",
|
||||
"tuapse",
|
||||
"crude",
|
||||
"november",
|
||||
"black"
|
||||
],
|
||||
"published_date": "31 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Canadian Natural Resources Q3 Profit Slips as Oil and Gas Prices Fall",
|
||||
"excerpt": "Canada’s largest oil and gas producer, Canadian Natural Resources (NYSE: CNQ), reported lower adjusted net earnings from operations for the third quarter compared to a…",
|
||||
"keywords": [
|
||||
"canada",
|
||||
"operations",
|
||||
"producer",
|
||||
"resources",
|
||||
"reported",
|
||||
"canadian",
|
||||
"largest",
|
||||
"third",
|
||||
"natural",
|
||||
"nyse"
|
||||
],
|
||||
"published_date": "31 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Exelon Reports 80% Surge in Data Center Power Supply Deals",
|
||||
"excerpt": "Exelon has seen an 80% increase in power supply deals coming from data enter operators in the latest sign that the IT industry is driving…",
|
||||
"keywords": [
|
||||
"industry",
|
||||
"data",
|
||||
"driving",
|
||||
"seen",
|
||||
"power",
|
||||
"increase",
|
||||
"exelon",
|
||||
"deals",
|
||||
"sign",
|
||||
"that"
|
||||
],
|
||||
"published_date": "31 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Russia’s Gazprom Boosts 2024 Investments to $16.9 Billion",
|
||||
"excerpt": "Gazprom is raising its investment plan for 2024 by 4% to $16.9 billion (1.642 trillion Russian rubles), thanks to rising exports and domestic supply, the…",
|
||||
"keywords": [
|
||||
"investment",
|
||||
"russian",
|
||||
"rubles",
|
||||
"plan",
|
||||
"exports",
|
||||
"billion",
|
||||
"raising",
|
||||
"thanks",
|
||||
"trillion",
|
||||
"supply"
|
||||
],
|
||||
"published_date": "30 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Investment Giants Form $50-Billion AI and Power Partnership",
|
||||
"excerpt": "Global investment firm KKR and private-equity giant Energy Capital Partners on Wednesday announced a $50 billion strategic partnership to invest in data centers and power…",
|
||||
"keywords": [
|
||||
"centers",
|
||||
"strategic",
|
||||
"investment",
|
||||
"giant",
|
||||
"energy",
|
||||
"capital",
|
||||
"private",
|
||||
"wednesday",
|
||||
"billion",
|
||||
"data"
|
||||
],
|
||||
"published_date": "30 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Vietnamese EV Maker Gets $1 Billion in Funding Led by UAE",
|
||||
"excerpt": "Vietnam’s electric vehicle manufacturer VinFast Auto is expected to receive at least $1 billion in overseas funding led by Emirates Driving Company (EDC), Abu Dhabi’s…",
|
||||
"keywords": [
|
||||
"overseas",
|
||||
"manufacturer",
|
||||
"vietnam",
|
||||
"expected",
|
||||
"billion",
|
||||
"driving",
|
||||
"emirates",
|
||||
"funding",
|
||||
"receive",
|
||||
"least"
|
||||
],
|
||||
"published_date": "30 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Chinese Oil Major to Explore Iraqi Field",
|
||||
"excerpt": "China’s CNOOC has inked a deal for exploration at an oil field in central Iraq, the company said today.\nThe deposit, Block 7, will be…",
|
||||
"keywords": [
|
||||
"deposit",
|
||||
"cnooc",
|
||||
"iraq",
|
||||
"field",
|
||||
"central",
|
||||
"deal",
|
||||
"today",
|
||||
"said",
|
||||
"china",
|
||||
"inked"
|
||||
],
|
||||
"published_date": "30 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "TotalEnergies to Produce More Gas Condensate Offshore Denmark",
|
||||
"excerpt": "U.S. refining and chemicals giant Phillips 66 (NYSE: PSX) booked higher-than-expected earnings for the third quarter even if earnings plunged from a year earlier, as…",
|
||||
"keywords": [
|
||||
"phillips",
|
||||
"refining",
|
||||
"giant",
|
||||
"than",
|
||||
"expected",
|
||||
"higher",
|
||||
"year",
|
||||
"plunged",
|
||||
"third",
|
||||
"even"
|
||||
],
|
||||
"published_date": "29 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Phillips 66 Beats Analyst Estimates Despite Earnings Dip in Q3",
|
||||
"excerpt": "U.S. refining and chemicals giant Phillips 66 (NYSE: PSX) booked higher-than-expected earnings for the third quarter even if earnings plunged from a year earlier, as…",
|
||||
"keywords": [
|
||||
"phillips",
|
||||
"refining",
|
||||
"giant",
|
||||
"than",
|
||||
"expected",
|
||||
"higher",
|
||||
"year",
|
||||
"plunged",
|
||||
"third",
|
||||
"even"
|
||||
],
|
||||
"published_date": "29 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "UK Offshore Oil Platform Halted Due to Gas Compressor Issue",
|
||||
"excerpt": "Production via the Triton Floating Production Storage & Offloading (FPSO) vessel in the UK North Sea has been halted due to a problem with the…",
|
||||
"keywords": [
|
||||
"fpso",
|
||||
"been",
|
||||
"with",
|
||||
"problem",
|
||||
"halted",
|
||||
"storage",
|
||||
"triton",
|
||||
"vessel",
|
||||
"offloading",
|
||||
"north"
|
||||
],
|
||||
"published_date": "29 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "UAE’s Renewable Energy Giant Pushes Back Green Hydrogen Targets",
|
||||
"excerpt": "Masdar, the clean energy giant of the United Arab Emirates (UAE), has pushed back its target to reach 1 million tons per year of green…",
|
||||
"keywords": [
|
||||
"united",
|
||||
"energy",
|
||||
"giant",
|
||||
"emirates",
|
||||
"back",
|
||||
"year",
|
||||
"million",
|
||||
"arab",
|
||||
"pushed",
|
||||
"target"
|
||||
],
|
||||
"published_date": "28 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Profit at India’s Top Refiner Slumps by 99% Due to Weak Margins",
|
||||
"excerpt": "IndianOil, the biggest refiner in India, reported on Monday a net profit tumbling by 98.6% in the quarter to September from a year ago amid…",
|
||||
"keywords": [
|
||||
"refiner",
|
||||
"monday",
|
||||
"september",
|
||||
"biggest",
|
||||
"reported",
|
||||
"indianoil",
|
||||
"india",
|
||||
"year",
|
||||
"tumbling",
|
||||
"profit"
|
||||
],
|
||||
"published_date": "28 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Average U.S. Gasoline Price Set to Drop Below $3 for the First Time Since 2021",
|
||||
"excerpt": "The U.S. national average price of gasoline is set to soon fall below $3 per gallon for the first time since 2021, amid lower seasonal…",
|
||||
"keywords": [
|
||||
"gasoline",
|
||||
"national",
|
||||
"below",
|
||||
"gallon",
|
||||
"soon",
|
||||
"first",
|
||||
"lower",
|
||||
"average",
|
||||
"seasonal",
|
||||
"price"
|
||||
],
|
||||
"published_date": "28 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "FERC Grants Exxon and Qatar Three-Year Extension to Build Golden Pass LNG",
|
||||
"excerpt": "The U.S. Federal Energy Regulatory Commission has granted a three-year extension to ExxonMobil and QatarEnergy to build their $10-billion Golden Pass LNG export plant in…",
|
||||
"keywords": [
|
||||
"federal",
|
||||
"export",
|
||||
"three",
|
||||
"energy",
|
||||
"golden",
|
||||
"billion",
|
||||
"year",
|
||||
"their",
|
||||
"qatarenergy",
|
||||
"regulatory"
|
||||
],
|
||||
"published_date": "25 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Cepsa: Windfall Tax Would Delay Its $3.3-Billion Hydrogen Plan",
|
||||
"excerpt": "Cepsa, Spain’s second-largest oil company, will delay its $3.25 billion (3 billion euros) investment into domestic green hydrogen projects if Spain makes the windfall tax…",
|
||||
"keywords": [
|
||||
"investment",
|
||||
"second",
|
||||
"projects",
|
||||
"billion",
|
||||
"euros",
|
||||
"largest",
|
||||
"into",
|
||||
"delay",
|
||||
"will",
|
||||
"cepsa"
|
||||
],
|
||||
"published_date": "25 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "South Africa Seeks Loan Guarantees for Energy Transition Funding",
|
||||
"excerpt": "South Africa is currently negotiating loan guarantees with its international partners in its $9.3-billion Just Energy Transition Partnership (JETP) program for energy investment.\nThe International…",
|
||||
"keywords": [
|
||||
"jetp",
|
||||
"negotiating",
|
||||
"energy",
|
||||
"transition",
|
||||
"currently",
|
||||
"investment",
|
||||
"billion",
|
||||
"south",
|
||||
"africa",
|
||||
"guarantees"
|
||||
],
|
||||
"published_date": "25 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Saudi Oil Export Revenues Hit Three-Year Low as Prices Decline",
|
||||
"excerpt": "Lower crude oil prices dragged Saudi Arabia’s oil export revenues to the lowest level in more than three years in August, amid underwhelming oil demand…",
|
||||
"keywords": [
|
||||
"years",
|
||||
"three",
|
||||
"august",
|
||||
"than",
|
||||
"more",
|
||||
"dragged",
|
||||
"revenues",
|
||||
"saudi",
|
||||
"crude",
|
||||
"prices"
|
||||
],
|
||||
"published_date": "24 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Tesla Stock Soars After Q3 Earnings Beat",
|
||||
"excerpt": "Tesla (NASDAQ: TSLA) saw its shares jump by 20% after hours on Wednesday and another 14% in pre-market trade on Thursday after reporting earnings for…",
|
||||
"keywords": [
|
||||
"thursday",
|
||||
"after",
|
||||
"trade",
|
||||
"market",
|
||||
"tesla",
|
||||
"wednesday",
|
||||
"another",
|
||||
"nasdaq",
|
||||
"hours",
|
||||
"reporting"
|
||||
],
|
||||
"published_date": "24 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Oil Refining Giant Valero Tops Estimates Despite Q3 Profit Plunge",
|
||||
"excerpt": "One of the biggest U.S. refiners, Valero Energy (NYSE: VLO), beat Wall Street estimates even as it reported a widely expected plunge in its third-quarter…",
|
||||
"keywords": [
|
||||
"street",
|
||||
"energy",
|
||||
"biggest",
|
||||
"wall",
|
||||
"reported",
|
||||
"expected",
|
||||
"plunge",
|
||||
"widely",
|
||||
"third",
|
||||
"valero"
|
||||
],
|
||||
"published_date": "24 October 2024"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,109 @@
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.firefox.options import Options
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
AUTHOR_URL = "https://oilprice.com/contributors/Charles-Kennedy" # Replace with actual author URL
|
||||
OUTPUT_FILE = "author_info.json"
|
||||
|
||||
def extract_keywords(text):
|
||||
"""Basic keyword extraction by finding unique words longer than 3 characters."""
|
||||
words = re.findall(r'\b\w{4,}\b', text.lower())
|
||||
keywords = list(set(words))
|
||||
return keywords[:10] # Limit to top 10 unique keywords for simplicity
|
||||
|
||||
def scrape_author_info(author_url, headline_pages=1):
|
||||
"""Scrape author's name, bio, contributor since date, and latest article headlines with excerpts, keywords, and timestamp."""
|
||||
options = Options()
|
||||
options.headless = True
|
||||
driver = webdriver.Firefox(options=options)
|
||||
|
||||
author_name = "Unknown"
|
||||
author_bio = ""
|
||||
contributor_since = ""
|
||||
other_articles = []
|
||||
|
||||
try:
|
||||
# Load author page
|
||||
driver.get(author_url)
|
||||
WebDriverWait(driver, 15).until(
|
||||
EC.presence_of_element_located((By.TAG_NAME, "h1"))
|
||||
)
|
||||
page_source = driver.page_source
|
||||
bio_soup = BeautifulSoup(page_source, "html.parser")
|
||||
|
||||
# Extract author name
|
||||
author_name_tag = bio_soup.find('h1')
|
||||
author_name = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"
|
||||
|
||||
# Extract author bio
|
||||
author_bio_tag = bio_soup.find('div', class_='biography')
|
||||
author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"
|
||||
|
||||
# Extract contributor since date
|
||||
contributor_since_tag = bio_soup.find('p', class_='contributor_since')
|
||||
contributor_since = contributor_since_tag.get_text(strip=True).replace("Contributor since: ", "") if contributor_since_tag else "Unknown Date"
|
||||
|
||||
# Extract latest articles by author with heading, excerpt, keywords, and timestamp
|
||||
for page in range(1, headline_pages + 1):
|
||||
driver.get(f"{author_url}/Page-{page}.html")
|
||||
WebDriverWait(driver, 10).until(
|
||||
EC.presence_of_element_located((By.CLASS_NAME, "articles"))
|
||||
)
|
||||
page_soup = BeautifulSoup(driver.page_source, "html.parser")
|
||||
article_tags = page_soup.find_all('li', class_='clear')
|
||||
|
||||
for article in article_tags:
|
||||
heading_tag = article.find('h3')
|
||||
excerpt_tag = article.find('p', class_='articlecontent')
|
||||
timestamp_tag = article.find('div', class_='meta')
|
||||
|
||||
if heading_tag and excerpt_tag and timestamp_tag:
|
||||
heading = heading_tag.get_text(strip=True)
|
||||
excerpt = excerpt_tag.get_text(strip=True)
|
||||
timestamp = timestamp_tag.get_text(strip=True).split("|")[0].replace("Published ", "").strip()
|
||||
keywords = extract_keywords(excerpt)
|
||||
|
||||
other_articles.append({
|
||||
"heading": heading,
|
||||
"excerpt": excerpt,
|
||||
"keywords": keywords,
|
||||
"published_date": timestamp
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error scraping author info: {e}")
|
||||
author_name = "Error Occurred"
|
||||
author_bio = str(e)
|
||||
contributor_since = "N/A"
|
||||
other_articles = [{"heading": "Error retrieving articles", "excerpt": "", "keywords": [], "published_date": ""}]
|
||||
|
||||
finally:
|
||||
driver.quit()
|
||||
|
||||
return {
|
||||
"name": author_name,
|
||||
"bio": author_bio,
|
||||
"contributor_since": contributor_since,
|
||||
"other_articles": other_articles
|
||||
}
|
||||
|
||||
def save_to_json(data, output_file):
|
||||
"""Save author info to a JSON file."""
|
||||
with open(output_file, mode="w", encoding="utf-8") as file:
|
||||
json.dump(data, file, ensure_ascii=False, indent=4)
|
||||
|
||||
print(f"Author info saved to {output_file}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Scrape author info
|
||||
author_info = scrape_author_info(AUTHOR_URL, headline_pages=1)
|
||||
|
||||
# Save to JSON
|
||||
save_to_json(author_info, OUTPUT_FILE)
|
||||
|
||||
@@ -0,0 +1,106 @@
import json
import re
import time
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

AUTHOR_URL = "https://oilprice.com/contributors/Charles-Kennedy"  # Replace with actual author URL
OUTPUT_FILE = "author_info.json"


def extract_keywords(text):
    """Basic keyword extraction by finding unique words longer than 3 characters."""
    words = re.findall(r'\b\w{4,}\b', text.lower())
    keywords = list(set(words))
    return keywords[:10]  # Limit to top 10 unique keywords for simplicity


def scrape_author_info(author_url, headline_pages=1):
    """Scrape the author's name, bio, contributor-since date, and latest article headlines with excerpts and keywords."""
    options = Options()
    options.add_argument("--headless")  # the options.headless attribute is deprecated/removed in newer Selenium releases
    driver = webdriver.Firefox(options=options)

    author_name = "Unknown"
    author_bio = ""
    contributor_since = ""
    other_articles = []

    try:
        # Load author page
        driver.get(author_url)
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )
        page_source = driver.page_source
        bio_soup = BeautifulSoup(page_source, "html.parser")

        # Extract author name
        author_name_tag = bio_soup.find('h1')
        author_name = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"

        # Extract author bio
        author_bio_tag = bio_soup.find('div', class_='biography')
        author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"

        # Extract contributor since date
        contributor_since_tag = bio_soup.find('p', class_='contributor_since')
        contributor_since = contributor_since_tag.get_text(strip=True).replace("Contributor since: ", "") if contributor_since_tag else "Unknown Date"

        # Extract latest articles by author with heading, excerpt, and keywords
        for page in range(1, headline_pages + 1):
            driver.get(f"{author_url}/Page-{page}.html")
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "articles"))
            )
            page_soup = BeautifulSoup(driver.page_source, "html.parser")
            article_tags = page_soup.find_all('li', class_='clear')

            for article in article_tags:
                heading_tag = article.find('h3')
                excerpt_tag = article.find('p', class_='articlecontent')

                if heading_tag and excerpt_tag:
                    heading = heading_tag.get_text(strip=True)
                    excerpt = excerpt_tag.get_text(strip=True)
                    keywords = extract_keywords(excerpt)

                    other_articles.append({
                        "heading": heading,
                        "excerpt": excerpt,
                        "keywords": keywords
                    })

    except Exception as e:
        print(f"Error scraping author info: {e}")
        author_name = "Error Occurred"
        author_bio = str(e)
        contributor_since = "N/A"
        other_articles = [{"heading": "Error retrieving articles", "excerpt": "", "keywords": []}]

    finally:
        driver.quit()

    return {
        "name": author_name,
        "bio": author_bio,
        "contributor_since": contributor_since,
        "other_articles": other_articles
    }


def save_to_json(data, output_file):
    """Save author info to a JSON file."""
    with open(output_file, mode="w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

    print(f"Author info saved to {output_file}")


if __name__ == "__main__":
    # Scrape author info
    author_info = scrape_author_info(AUTHOR_URL, headline_pages=1)

    # Save to JSON
    save_to_json(author_info, OUTPUT_FILE)

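For reference, a small usage sketch (not part of the committed file) for inspecting the scraper's output after a run; it assumes the default author_info.json written by save_to_json and the keys returned by scrape_author_info:

import json

# Load the file written by save_to_json and print a quick summary.
with open("author_info.json", encoding="utf-8") as fh:
    info = json.load(fh)

print(info["name"], "-", info["contributor_since"])
for article in info["other_articles"]:
    print(f'{article["heading"]}: {", ".join(article["keywords"])}')
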
@@ -0,0 +1,26 @@
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
import time

# Provide the path to your geckodriver executable using the Service class
service = Service(executable_path='/usr/local/bin/geckodriver')
driver = webdriver.Firefox(service=service)

# Open a website (e.g., OilPrice.com)
driver.get("https://oilprice.com/Latest-Energy-News/World-News/")

# Wait for the page to load
time.sleep(5)

# Print the title of the page to verify that it's loaded
print(driver.title)

# Find and print some element on the page, e.g., all article titles
articles = driver.find_elements(By.CSS_SELECTOR, "div.categoryArticle")
for article in articles:
    title = article.find_element(By.TAG_NAME, "a").text
    print(f"Article title: {title}")

# Close the browser
driver.quit()

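The fixed time.sleep(5) in the snippet above is a blunt wait; here is a minimal sketch of the same check using the explicit-wait pattern the author-scraper script already uses. It reuses the driver and By objects from the snippet and assumes the div.categoryArticle selector is still present on the page:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block for up to 10 seconds until at least one article card is present,
# then continue immediately instead of always sleeping a fixed interval.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "div.categoryArticle"))
)
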
0
src/Data-Collection/WebScraper/setup.py
Normal file
246
src/GUSHTradingBotV1.0.py
Normal file
@@ -0,0 +1,246 @@
import numpy as np
import pandas as pd
import yfinance as yf
from scipy.optimize import minimize


def ticker_info():
    ticker = "gush"
    return ticker.upper()


def fetch_expiration_dates(ticker):
    print(f"Fetching available expiration dates for {ticker}...")
    stock = yf.Ticker(ticker)
    expiration_dates = stock.options
    print(f"Available expiration dates: {expiration_dates}")
    return expiration_dates


def select_expiration_date(expiration_dates):
    print("Selecting the first available expiration date...")
    expiration_date = expiration_dates[0]
    print(f"Selected expiration date: {expiration_date}")
    return expiration_date


def fetch_option_chain(ticker, expiration_date):
    print(f"Fetching option chain for {ticker} with expiration date {expiration_date}...")
    stock = yf.Ticker(ticker)
    options_chain = stock.option_chain(expiration_date)
    print("Option chain fetched successfully!")
    return options_chain


def get_price_data(ticker, start_date, end_date):
    print(f"Fetching price data for {ticker} from {start_date} to {end_date}...")
    data = yf.download(ticker, start=start_date, end=end_date)
    print(f"Price data fetched successfully for {ticker}!")
    return data


def moving_average_strategy(data, short_window=20, long_window=50):
    data['Short_MA'] = data['Close'].rolling(window=short_window).mean()
    data['Long_MA'] = data['Close'].rolling(window=long_window).mean()
    data['Signal'] = np.where(data['Short_MA'] > data['Long_MA'], 1, -1)
    return data['Signal']


def rsi_strategy(data, window=14, overbought=70, oversold=30):
    delta = data['Close'].diff(1)
    gain = np.where(delta > 0, delta, 0).flatten()  # Flatten to 1D array
    loss = np.where(delta < 0, abs(delta), 0).flatten()  # Flatten to 1D array

    avg_gain = pd.Series(gain).rolling(window=window).mean()
    avg_loss = pd.Series(loss).rolling(window=window).mean()

    # Avoid division by zero by using np.where to replace 0 with np.nan in avg_loss
    rs = avg_gain / np.where(avg_loss == 0, np.nan, avg_loss)

    rsi = 100 - (100 / (1 + rs))

    signal = np.where(rsi < oversold, 1, np.where(rsi > overbought, -1, 0))
    return pd.Series(signal, index=data.index)


def bollinger_bands_strategy(data, window=20, num_std=2):
    # Calculate moving average
    data['Moving_Avg'] = data['Close'].rolling(window=window).mean()

    # Calculate rolling standard deviation and force it to be a Series
    rolling_std = data['Close'].rolling(window).std()
    rolling_std = rolling_std.squeeze()  # Ensure rolling_std is a Series

    # Print shapes for debugging
    print(f"Shape of Moving_Avg: {data['Moving_Avg'].shape}")
    print(f"Shape of Rolling Std: {rolling_std.shape}")

    # Calculate upper and lower bands
    data['Band_Upper'] = data['Moving_Avg'] + (num_std * rolling_std)
    data['Band_Lower'] = data['Moving_Avg'] - (num_std * rolling_std)

    # Print shapes after assignments for debugging
    print(f"Shape of Band_Upper: {data['Band_Upper'].shape}")
    print(f"Shape of Band_Lower: {data['Band_Lower'].shape}")

    # Check for NaN values
    print(f"NaNs in Close: {data['Close'].isna().sum()}")
    print(f"NaNs in Band_Upper: {data['Band_Upper'].isna().sum()}")
    print(f"NaNs in Band_Lower: {data['Band_Lower'].isna().sum()}")

    # Print the columns of the DataFrame
    print(f"Columns in data before dropping NaNs: {data.columns.tolist()}")

    # Optionally drop rows with NaNs
    data = data.dropna(subset=['Close', 'Band_Upper', 'Band_Lower'])

    # Generate signals based on the bands
    signal = np.where(data['Close'] < data['Band_Lower'], 1,
                      np.where(data['Close'] > data['Band_Upper'], -1, 0))

    return pd.Series(signal, index=data.index)


def generate_signals(data):
    ma_signal = moving_average_strategy(data)
    rsi_signal = rsi_strategy(data)
    bollinger_signal = bollinger_bands_strategy(data)
    return pd.DataFrame({'MA': ma_signal, 'RSI': rsi_signal, 'Bollinger': bollinger_signal})


def backtest_option_trades(option_chain, signals, stock_data):
    """
    Backtest option trades based on the given signals and stock data.
    """
    trades = []
    current_position = None

    # Callers pass a 1-D array of combined (weighted) signal values, so work
    # with a flat array here rather than indexing a per-strategy column.
    signal_values = np.asarray(signals).ravel()

    # Ensure both stock_data and option_chain indices are sorted in ascending order
    stock_data = stock_data.sort_index()

    # Convert 'lastTradeDate' or any date-related columns to datetime in option_chain
    if 'lastTradeDate' in option_chain.columns:
        option_chain['lastTradeDate'] = pd.to_datetime(option_chain['lastTradeDate'])
        option_chain = option_chain.set_index('lastTradeDate')

    # If option_chain index isn't datetime, convert it to datetime (ensuring compatibility)
    option_chain.index = pd.to_datetime(option_chain.index)

    # Remove the timezone from option_chain index
    option_chain.index = option_chain.index.tz_localize(None)

    # Now reindex the option chain to match the stock data index (forward fill missing option prices)
    option_chain = option_chain.sort_index()
    option_chain = option_chain.reindex(stock_data.index, method='ffill')

    for i in range(len(signal_values)):
        if signal_values[i] > 0 and current_position is None:
            # BUY signal (positive weighted signal)
            entry_price = option_chain['lastPrice'].iloc[i]
            if pd.isna(entry_price):  # If price is nan, log the error and continue
                print(f"Missing entry price on {stock_data.index[i]}, skipping trade.")
                continue
            entry_date = stock_data.index[i]
            current_position = {
                'entry_price': entry_price,
                'entry_date': entry_date
            }
            print(f"BUY signal on {entry_date}: Entry Price = {entry_price}")

        elif signal_values[i] < 0 and current_position is not None:
            # SELL signal (negative weighted signal)
            exit_price = option_chain['lastPrice'].iloc[i]
            if pd.isna(exit_price):  # If price is nan, log the error and continue
                print(f"Missing exit price on {stock_data.index[i]}, skipping trade.")
                continue
            exit_date = stock_data.index[i]
            pnl = (exit_price - current_position['entry_price']) * 100
            print(f"SELL signal on {exit_date}: Exit Price = {exit_price}, P&L = {pnl}")

            trades.append({
                'entry_date': current_position['entry_date'],
                'entry_price': current_position['entry_price'],
                'exit_date': exit_date,
                'exit_price': exit_price,
                'pnl': pnl
            })
            current_position = None

    cumulative_pnl = sum(trade['pnl'] for trade in trades)
    total_wins = sum(1 for trade in trades if trade['pnl'] > 0)
    total_trades = len(trades)
    win_rate = total_wins / total_trades if total_trades > 0 else 0

    return cumulative_pnl, trades, win_rate


def objective_function_profit(weights, strategy_signals, data, option_chain):
    weights = np.array(weights)
    weights /= np.sum(weights)  # Normalize weights
    weighted_signals = np.sum([signal * weight for signal, weight in zip(strategy_signals.T.values, weights)], axis=0)

    # Since `backtest_option_trades` returns 3 values, we only unpack those
    cumulative_pnl, _, _ = backtest_option_trades(option_chain, weighted_signals, data)

    # Return negative cumulative P&L to maximize profit
    return -cumulative_pnl


def optimize_weights(strategy_signals, data, option_chain):
    initial_weights = [1 / len(strategy_signals.columns)] * len(strategy_signals.columns)
    constraints = ({'type': 'eq', 'fun': lambda weights: np.sum(weights) - 1})
    bounds = [(0, 1)] * len(strategy_signals.columns)

    result = minimize(objective_function_profit, initial_weights, args=(strategy_signals, data, option_chain),
                      method='SLSQP', bounds=bounds, constraints=constraints)
    return result.x  # Optimal weights


def weighted_signal_combination(strategy_signals, weights):
    weighted_signals = np.sum([signal * weight for signal, weight in zip(strategy_signals.T.values, weights)], axis=0)
    return weighted_signals


def main_decision(weighted_signals):
    last_signal = weighted_signals[-1]  # Latest signal
    if last_signal > 0:
        return "BUY"
    elif last_signal < 0:
        return "SELL"
    else:
        return "HOLD"


def run_backtest():
    ticker = ticker_info()
    expiration_dates = fetch_expiration_dates(ticker)
    expiration_date = select_expiration_date(expiration_dates)
    options_chain = fetch_option_chain(ticker, expiration_date)

    # Fetch training data
    train_data = get_price_data(ticker, '2010-01-01', '2022-01-01')

    # Generate signals
    strategy_signals_train = generate_signals(train_data)

    # Optimize weights
    optimal_weights = optimize_weights(strategy_signals_train, train_data, options_chain.calls)

    # Fetch test data
    test_data = get_price_data(ticker, '2022-01-02', '2024-01-01')

    # Generate test signals
    strategy_signals_test = generate_signals(test_data)

    # Combine signals and backtest
    weighted_signals = weighted_signal_combination(strategy_signals_test, optimal_weights)
    cumulative_pnl, trades, win_rate = backtest_option_trades(options_chain.calls, weighted_signals, test_data)

    # Make final decision
    decision = main_decision(weighted_signals)
    print(f"Final decision: {decision}")

    # Output results
    print(f"Cumulative P&L: {cumulative_pnl}")
    print(f"Win Rate: {win_rate * 100:.2f}%")


# Call the main function when run as a script
if __name__ == "__main__":
    run_backtest()

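As a quick illustration of how weighted_signal_combination and main_decision turn the per-strategy signals into a single action, here is a self-contained toy sketch with made-up signal values and weights (in the real bot the weights come from optimize_weights):

import numpy as np
import pandas as pd

# Hypothetical per-strategy signals for three days (+1 buy, -1 sell, 0 hold).
signals = pd.DataFrame({'MA': [1, 1, -1], 'RSI': [0, 1, -1], 'Bollinger': [1, 0, 0]})
weights = np.array([0.5, 0.3, 0.2])  # assumed optimizer output; sums to 1

# Same weighted sum used in weighted_signal_combination().
combined = np.sum([sig * w for sig, w in zip(signals.T.values, weights)], axis=0)
print(combined)  # [ 0.7  0.8 -0.8]
print("BUY" if combined[-1] > 0 else "SELL" if combined[-1] < 0 else "HOLD")  # SELL
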