Entirely new file path. Added Standard Practices, Moved Docs
This commit is contained in:
184
src/API/API_1.ipynb
Normal file
@@ -0,0 +1,184 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "69d88f26-f288-4a23-8be5-3e8317e23731",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "ERROR -1 2104 Market data farm connection is OK:usfarm.nj\n",
      "ERROR -1 2104 Market data farm connection is OK:usfuture\n",
      "ERROR -1 2104 Market data farm connection is OK:cashfarm\n",
      "ERROR -1 2104 Market data farm connection is OK:usfarm\n",
      "ERROR -1 2106 HMDS data farm connection is OK:ushmds\n",
      "ERROR -1 2158 Sec-def data farm connection is OK:secdefnj\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Historical Data Ended\n",
      " Date Open High Low Close Volume\n",
      "0 20241030 18:00:00 69.10 69.10 68.96 69.02 378\n",
      "1 20241030 18:05:00 69.02 69.07 69.01 69.05 99\n",
      "2 20241030 18:10:00 69.06 69.07 69.01 69.01 103\n",
      "3 20241030 18:15:00 69.01 69.02 69.00 69.00 54\n",
      "4 20241030 18:20:00 69.01 69.01 68.99 69.00 25\n",
      "5 20241030 18:25:00 69.00 69.05 69.00 69.04 40\n",
      "6 20241030 18:30:00 69.05 69.05 69.03 69.03 63\n",
      "7 20241030 18:35:00 69.03 69.03 69.00 69.00 64\n",
      "8 20241030 18:40:00 68.99 69.01 68.98 68.99 60\n",
      "9 20241030 18:45:00 68.99 68.99 68.95 68.97 66\n",
      "10 20241030 18:50:00 68.97 69.00 68.96 68.99 44\n",
      "11 20241030 18:55:00 68.98 68.98 68.97 68.98 23\n",
      "12 20241030 19:00:00 68.98 69.02 68.98 69.01 48\n",
      "13 20241030 19:05:00 69.02 69.03 69.00 69.01 31\n",
      "14 20241030 19:10:00 69.02 69.02 69.00 69.00 22\n",
      "15 20241030 19:15:00 69.00 69.00 68.99 68.99 11\n",
      "16 20241030 19:20:00 68.99 68.99 68.95 68.95 40\n",
      "17 20241030 19:25:00 68.95 68.95 68.94 68.94 55\n",
      "18 20241030 19:30:00 68.94 68.96 68.93 68.95 54\n",
      "19 20241030 19:35:00 68.95 68.97 68.95 68.96 29\n",
      "20 20241030 19:40:00 68.96 68.98 68.96 68.98 47\n",
      "21 20241030 19:45:00 68.98 68.99 68.95 68.95 65\n",
      "22 20241030 19:50:00 68.96 68.98 68.96 68.97 16\n",
      "23 20241030 19:55:00 68.97 68.97 68.94 68.94 35\n",
      "24 20241030 20:00:00 68.95 68.99 68.91 68.92 369\n",
      "25 20241030 20:05:00 68.91 68.94 68.91 68.93 74\n",
      "26 20241030 20:10:00 68.93 68.95 68.89 68.94 187\n",
      "27 20241030 20:15:00 68.94 68.95 68.92 68.94 81\n",
      "28 20241030 20:20:00 68.95 68.97 68.94 68.96 89\n",
      "29 20241030 20:25:00 68.96 68.96 68.92 68.94 96\n",
      "30 20241030 20:30:00 68.94 68.98 68.93 68.96 94\n",
      "31 20241030 20:35:00 68.97 68.97 68.93 68.94 66\n",
      "32 20241030 20:40:00 68.95 68.95 68.93 68.94 44\n",
      "33 20241030 20:45:00 68.93 68.96 68.93 68.94 98\n",
      "34 20241030 20:50:00 68.94 68.94 68.92 68.92 95\n"
     ]
    }
   ],
   "source": [
    "from ibapi.client import EClient\n",
    "from ibapi.wrapper import EWrapper\n",
    "from ibapi.contract import Contract\n",
    "import threading\n",
    "import time\n",
    "import pandas as pd\n",
    "\n",
    "# Define the IB API app\n",
    "class IBApi(EWrapper, EClient):\n",
    "    def __init__(self):\n",
    "        EClient.__init__(self, self)\n",
    "        self.data = [] # Initialize an empty list to store data\n",
    "\n",
    "    # Override the historicalData function to process and store incoming data\n",
    "    def historicalData(self, reqId, bar):\n",
    "        # Append the data as a dictionary to self.data\n",
    "        self.data.append({\n",
    "            \"Date\": bar.date,\n",
    "            \"Open\": bar.open,\n",
    "            \"High\": bar.high,\n",
    "            \"Low\": bar.low,\n",
    "            \"Close\": bar.close,\n",
    "            \"Volume\": bar.volume\n",
    "        })\n",
    "\n",
    "    def historicalDataEnd(self, reqId, start, end):\n",
    "        print(\"Historical Data Ended\")\n",
    "        # Convert the data to a DataFrame when data collection is complete\n",
    "        self.df = pd.DataFrame(self.data)\n",
    "        print(self.df) # Display the DataFrame to verify\n",
    "        self.disconnect() # Disconnect after data collection is complete\n",
    "\n",
    "# Define the app handler for running in the notebook\n",
    "class IBApp:\n",
    "    def __init__(self):\n",
    "        self.app = IBApi()\n",
    "\n",
    "    def connect(self):\n",
    "        self.app.connect(\"127.0.0.1\", 7496, 0) # Change port if needed\n",
    "        thread = threading.Thread(target=self.run_app, daemon=True)\n",
    "        thread.start()\n",
    "        time.sleep(1) # Allow time for the connection to establish\n",
    "\n",
    "    def run_app(self):\n",
    "        self.app.run()\n",
    "\n",
    "    def request_oil_data(self):\n",
    "        # Define the contract for Crude Oil Futures\n",
    "        contract = Contract()\n",
    "        contract.symbol = \"CL\"\n",
    "        contract.secType = \"FUT\"\n",
    "        contract.exchange = \"NYMEX\"\n",
    "        contract.currency = \"USD\"\n",
    "        contract.lastTradeDateOrContractMonth = \"202412\" # Example: Dec 2024 contract\n",
    "\n",
    "        # Request historical data\n",
    "        self.app.reqHistoricalData(\n",
    "            reqId=1,\n",
    "            contract=contract,\n",
    "            endDateTime='',\n",
    "            durationStr='1 D', # 1 day of data\n",
    "            barSizeSetting='5 mins',\n",
    "            whatToShow='TRADES',\n",
    "            useRTH=0,\n",
    "            formatDate=1,\n",
    "            keepUpToDate=False,\n",
    "            chartOptions=[]\n",
    "        )\n",
    "\n",
    "    def disconnect(self):\n",
    "        self.app.disconnect()\n",
    "\n",
    "# Create an instance and connect\n",
    "app = IBApp()\n",
    "app.connect()\n",
    "\n",
    "# Request data and output to a DataFrame\n",
    "app.request_oil_data()\n",
    "\n",
    "# Wait for data retrieval to complete\n",
    "time.sleep(10)\n",
    "\n",
    "# Access the DataFrame\n",
    "df = app.app.df if hasattr(app.app, 'df') else pd.DataFrame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "2088c621-81d3-46f0-8596-ce05d1a89fd4",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = df.to_csv()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
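The cell above blocks on a fixed time.sleep(10) before reading app.app.df. A minimal sketch of a more deterministic wait, assuming the IBApi and IBApp classes defined in that cell (the subclass name below is hypothetical, not part of this commit):

import threading
import pandas as pd

class IBApiWithEvent(IBApi):  # hypothetical subclass of the notebook's IBApi
    def __init__(self):
        super().__init__()
        self.done = threading.Event()  # set once historicalDataEnd has built self.df

    def historicalDataEnd(self, reqId, start, end):
        super().historicalDataEnd(reqId, start, end)  # builds self.df and disconnects
        self.done.set()

# Usage sketch: wait up to 30 seconds instead of sleeping a fixed amount
# app.app.done.wait(timeout=30)
# df = getattr(app.app, 'df', pd.DataFrame())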
2074
src/API/API_2 (1).ipynb
Normal file
File diff suppressed because one or more lines are too long
BIN
src/API/Trading_Bot_Development_Strategy (1).docx
Normal file
Binary file not shown.
1
src/Data-Collection/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
venv/
0
src/Data-Collection/WebScraper/README.md
Normal file
16
src/Data-Collection/WebScraper/assets/oil_key_words.txt
Normal file
@@ -0,0 +1,16 @@
oil 5
profit 4
price 3
gas 4
energy 5
production 3
demand 2
supply 2
barrel 3
economy 4
investment 3
revenue 4
loss 2
rise 5
decline 1
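Each line above pairs a keyword with an integer importance weight. A minimal parsing sketch, mirroring the load_keyword_importance helpers added elsewhere in this commit (the relative path is an assumption):

weights = {}
with open("assets/oil_key_words.txt", encoding="utf-8") as f:
    for line in f:
        parts = line.split()  # e.g. ["oil", "5"]
        if len(parts) == 2:
            word, importance = parts
            weights[word.lower()] = int(importance)
# weights["oil"] == 5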
2471
src/Data-Collection/WebScraper/data/oil_news.json
Normal file
File diff suppressed because it is too large
4003
src/Data-Collection/WebScraper/data/preprocessed_oil_news.json
Normal file
File diff suppressed because it is too large
47
src/Data-Collection/WebScraper/main.py
Normal file
@@ -0,0 +1,47 @@
import argparse
import sys
import time
import scrapers.oil_news_scraper as oil_news
import scrapers.oil_news_preprocessor as oil_news_preprocessor
from tqdm import tqdm

def show_usage_bar(duration):
    for _ in tqdm(range(duration), desc="Processing", unit="sec"):
        time.sleep(1)

def run_scraper():
    print("Starting oil data collection with the scraper...")
    show_usage_bar(0)  # Simulated progress bar duration
    oil_news.run_scraper()
    print("Oil news data scraping completed.")

def run_preprocessor():
    print("Starting oil data collection with the preprocessor...")
    show_usage_bar(0)  # Simulated progress bar duration
    oil_news_preprocessor.run_preprocessor()
    print("Oil news data preprocessing completed.")

def main():
    parser = argparse.ArgumentParser(
        description="Oil News Data Collection Tool"
    )
    parser.add_argument(
        "--scraper", action="store_true", help="Run the oil news scraper (original code)."
    )
    parser.add_argument(
        "--preprocessed", action="store_true", help="Run the oil news preprocessor (new code for sentiment analysis)."
    )

    args = parser.parse_args()

    if args.scraper:
        run_scraper()
    elif args.preprocessed:
        run_preprocessor()
    else:
        print("No valid option selected. Use '--scraper' to run the scraper or '--preprocessed' to run the preprocessor.")
        parser.print_help()

if __name__ == "__main__":
    main()
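Assuming the dependencies (selenium, beautifulsoup4, tqdm) are installed and the script is run from the WebScraper directory, the tool is invoked with one of:

python main.py --scraper        (writes data/oil_news.json)
python main.py --preprocessed   (writes data/preprocessed_oil_news.json)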
13
src/Data-Collection/WebScraper/main.py.bak
Normal file
@@ -0,0 +1,13 @@
# main.py
import scrapers.oil_news_scraper as oil_news

def main():
    print("Starting oil data collection...")

    # Run oil market news scraper
    oil_news.run_scraper()

    print("Oil news data scraping completed.")

if __name__ == "__main__":
    main()
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,231 @@
|
||||
import json
|
||||
import re
|
||||
import os
|
||||
import time
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.firefox.options import Options
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from bs4 import BeautifulSoup
|
||||
from tqdm import tqdm # Progress bar
|
||||
|
||||
OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
|
||||
SCRAPER_DIR = os.path.dirname(os.path.dirname(__file__)) # One level up
|
||||
DATA_DIR = os.path.join(SCRAPER_DIR, "data")
|
||||
KEYWORD_FILE_PATH = os.path.join(SCRAPER_DIR, "assets", "oil_key_words.txt")
|
||||
|
||||
if not os.path.exists(DATA_DIR):
|
||||
os.makedirs(DATA_DIR)
|
||||
|
||||
def load_existing_data(file_path):
|
||||
if os.path.exists(file_path):
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
return json.load(f)
|
||||
return []
|
||||
|
||||
def save_to_json(data, file_path):
|
||||
existing_data = load_existing_data(file_path)
|
||||
existing_links = {article['link'] for article in existing_data if 'link' in article}
|
||||
|
||||
new_data = []
|
||||
for article in data:
|
||||
if 'link' not in article or article['link'] in existing_links:
|
||||
print(f"Skipping duplicate or missing link article: {article.get('headline', 'Unknown Headline')}")
|
||||
continue
|
||||
new_data.append(article)
|
||||
|
||||
combined_data = existing_data + new_data
|
||||
|
||||
with open(file_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(combined_data, f, ensure_ascii=False, indent=4)
|
||||
print(f"Data saved to {file_path}")
|
||||
|
||||
def load_keyword_importance(file_path):
|
||||
keyword_importance = {}
|
||||
if os.path.exists(file_path):
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
for line in f:
|
||||
parts = line.strip().split()
|
||||
if len(parts) == 2:
|
||||
keyword, importance = parts
|
||||
keyword_importance[keyword.lower()] = int(importance)
|
||||
else:
|
||||
print(f"Keyword file not found at {file_path}")
|
||||
return keyword_importance
|
||||
|
||||
keyword_importance = load_keyword_importance(KEYWORD_FILE_PATH)
|
||||
|
||||
def extract_keywords(text, keyword_importance):
|
||||
words = re.findall(r'\b\w+\b', text.lower())
|
||||
keywords = {word: keyword_importance[word] for word in words if word in keyword_importance}
|
||||
return sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:10]
|
||||
|
||||
def filter_content(content):
|
||||
"""Remove advertisements, irrelevant phrases, headers, and disclaimers from content."""
|
||||
patterns = [
|
||||
r'ADVERTISEMENT',
|
||||
r'Click Here for \d+\+ Global Oil Prices',
|
||||
r'Find us on:',
|
||||
r'Back to homepage',
|
||||
r'Join the discussion',
|
||||
r'More Top Reads From Oilprice.com',
|
||||
r'©OilPrice\.com.*?educational purposes',
|
||||
r'A Media Solutions.*?Oilprice.com',
|
||||
r'\"It\'s most important 8 minute read of my week…\"',
|
||||
r'^[\w\s]*?is a [\w\s]*? for Oilprice\.com.*?More Info',
|
||||
r'^.*?DNOW is a supplier.*?,',
|
||||
]
|
||||
|
||||
for pattern in patterns:
|
||||
content = re.sub(pattern, '', content, flags=re.IGNORECASE)
|
||||
content = re.sub(r'\s+', ' ', content).strip()
|
||||
return content
|
||||
|
||||
def extract_author_info(driver, article_soup, headline_pages=1):
|
||||
"""Extract detailed author information from the 'read more' link if available."""
|
||||
author = "Unknown Author"
|
||||
author_bio = ""
|
||||
contributor_since = ""
|
||||
other_articles = []
|
||||
|
||||
author_tag = article_soup.find('a', text=re.compile(r'More Info|Read More', re.IGNORECASE))
|
||||
if author_tag:
|
||||
retries = 3 # Set retry limit
|
||||
for attempt in range(retries):
|
||||
try:
|
||||
driver.get(author_tag['href'])
|
||||
WebDriverWait(driver, 15).until(
|
||||
EC.presence_of_element_located((By.CLASS_NAME, "authorBio"))
|
||||
)
|
||||
bio_soup = BeautifulSoup(driver.page_source, "html.parser")
|
||||
|
||||
# Extract author's name
|
||||
author_name_tag = bio_soup.find('h1')
|
||||
author = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"
|
||||
|
||||
# Extract author's bio description
|
||||
author_bio_tag = bio_soup.find('p')
|
||||
author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"
|
||||
|
||||
# Extract contributor since date
|
||||
contributor_since_tag = bio_soup.find(text=re.compile(r"Contributor since", re.IGNORECASE))
|
||||
if contributor_since_tag:
|
||||
contributor_since = contributor_since_tag.parent.get_text(strip=True).replace("Contributor since: ", "")
|
||||
|
||||
# Extract headlines of latest articles by the author, limited by `headline_pages`
|
||||
for page in range(1, headline_pages + 1):
|
||||
driver.get(f"{author_tag['href']}Page-{page}.html")
|
||||
WebDriverWait(driver, 10).until(
|
||||
EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
|
||||
)
|
||||
page_soup = BeautifulSoup(driver.page_source, "html.parser")
|
||||
article_tags = page_soup.find_all('h2', class_='categoryArticle__title')
|
||||
|
||||
for article in article_tags:
|
||||
other_articles.append(article.get_text(strip=True))
|
||||
|
||||
break # Break loop if successful
|
||||
|
||||
except Exception as e:
|
||||
print(f"Attempt {attempt + 1} failed for author bio page. Retrying...")
|
||||
time.sleep(2) # Wait before retrying
|
||||
if attempt == retries - 1:
|
||||
print(f"Author bio page failed to load or extract after {retries} attempts. Error: {e}")
|
||||
|
||||
return {
|
||||
"name": author,
|
||||
"bio": author_bio,
|
||||
"contributor_since": contributor_since,
|
||||
"other_articles": other_articles
|
||||
}
|
||||
|
||||
def scrape_oil_news():
|
||||
print("Scraping oil news articles for sentiment analysis...")
|
||||
|
||||
options = Options()
|
||||
options.headless = True
|
||||
driver = webdriver.Firefox(options=options)
|
||||
|
||||
news_data = []
|
||||
page_number = 1
|
||||
max_pages = 1
|
||||
total_articles = 0
|
||||
|
||||
while page_number <= max_pages:
|
||||
driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
|
||||
try:
|
||||
WebDriverWait(driver, 10).until(
|
||||
EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
|
||||
)
|
||||
except:
|
||||
break
|
||||
soup = BeautifulSoup(driver.page_source, "html.parser")
|
||||
total_articles += len(soup.find_all('div', class_='categoryArticle'))
|
||||
page_number += 1
|
||||
|
||||
page_number = 1
|
||||
with tqdm(total=total_articles, desc="Scraping articles", unit="article") as pbar:
|
||||
while page_number <= max_pages:
|
||||
print(f"\nProcessing page {page_number}...")
|
||||
driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
|
||||
soup = BeautifulSoup(driver.page_source, "html.parser")
|
||||
articles = soup.find_all('div', class_='categoryArticle')
|
||||
if not articles:
|
||||
break
|
||||
|
||||
for article in articles:
|
||||
headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
|
||||
link_tag = article.find('a', href=True)
|
||||
link = link_tag['href'] if link_tag else None
|
||||
date_meta = article.find('p', class_='categoryArticle__meta')
|
||||
date = date_meta.get_text(strip=True).split('|')[0].strip() if date_meta else None
|
||||
|
||||
content = ""
|
||||
if link:
|
||||
print(f"Fetching article: {link}")
|
||||
driver.get(link)
|
||||
try:
|
||||
WebDriverWait(driver, 10).until(
|
||||
EC.presence_of_element_located((By.CLASS_NAME, "singleArticle"))
|
||||
)
|
||||
article_soup = BeautifulSoup(driver.page_source, "html.parser")
|
||||
raw_content = " ".join([p.get_text(strip=True) for p in article_soup.find_all('p')])
|
||||
content = filter_content(raw_content)
|
||||
author, author_bio = extract_author_info(driver, article_soup)
|
||||
except:
|
||||
print(f"Error: Content did not load for article {headline}.")
|
||||
|
||||
extracted_keywords = extract_keywords(f"{headline} {content}", keyword_importance)
|
||||
|
||||
if headline and link and date:
|
||||
author_info = extract_author_info(driver, article_soup, headline_pages=1)
|
||||
news_data.append({
|
||||
'headline': headline,
|
||||
'link': link,
|
||||
'content': content,
|
||||
'date': date,
|
||||
'author': author_info['name'],
|
||||
'author_bio': author_info['bio'],
|
||||
'contributor_since': author_info['contributor_since'],
|
||||
'other_articles': author_info['other_articles'],
|
||||
'keywords': extracted_keywords,
|
||||
})
|
||||
|
||||
pbar.set_postfix_str(f"Processing article: {headline[:40]}...")
|
||||
pbar.update(1)
|
||||
|
||||
page_number += 1
|
||||
time.sleep(2)
|
||||
|
||||
driver.quit()
|
||||
return news_data
|
||||
|
||||
def run_preprocessor():
|
||||
file_path = os.path.join(DATA_DIR, 'preprocessed_oil_news.json')
|
||||
news_data = scrape_oil_news()
|
||||
save_to_json(news_data, file_path)
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_preprocessor()
|
||||
|
||||
@@ -0,0 +1,100 @@
|
||||
import json
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.firefox.options import Options
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from bs4 import BeautifulSoup
|
||||
import os
|
||||
import time
|
||||
import re
|
||||
|
||||
OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
|
||||
DATA_DIR = os.path.join(os.getcwd(), "data")
|
||||
if not os.path.exists(DATA_DIR):
|
||||
os.makedirs(DATA_DIR)
|
||||
|
||||
def load_existing_data(file_path):
|
||||
"""Load existing data from JSON file to avoid duplicates."""
|
||||
if os.path.exists(file_path):
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
return json.load(f)
|
||||
return []
|
||||
|
||||
def save_to_json(data, file_path):
|
||||
"""Save scraped data to a JSON file, ensuring no duplicates."""
|
||||
existing_data = load_existing_data(file_path)
|
||||
existing_links = {article['link'] for article in existing_data}
|
||||
|
||||
new_data = [article for article in data if article['link'] not in existing_links]
|
||||
combined_data = existing_data + new_data
|
||||
|
||||
with open(file_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(combined_data, f, ensure_ascii=False, indent=4)
|
||||
print(f"Oil news data saved to {file_path}")
|
||||
|
||||
def extract_keywords(text):
|
||||
"""Simple function to extract keywords from text."""
|
||||
keywords = re.findall(r'\b\w+\b', text.lower())
|
||||
return list(set(keywords))[:10] # Return the first 10 unique keywords
|
||||
|
||||
def scrape_oil_news():
|
||||
print("Scraping oil market news using Selenium...")
|
||||
|
||||
options = Options()
|
||||
options.headless = True
|
||||
driver = webdriver.Firefox(options=options)
|
||||
|
||||
news_data = []
|
||||
page_number = 1
|
||||
max_pages = 10 # Limit to 10 pages
|
||||
|
||||
while page_number <= max_pages:
|
||||
# Load the page with pagination
|
||||
driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
|
||||
|
||||
try:
|
||||
WebDriverWait(driver, 20).until(
|
||||
EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"Error: Content did not load properly on page {page_number}.")
|
||||
break
|
||||
|
||||
soup = BeautifulSoup(driver.page_source, "html.parser")
|
||||
|
||||
articles = soup.find_all('div', class_='categoryArticle')
|
||||
if not articles:
|
||||
print(f"No articles found on page {page_number}. Ending pagination.")
|
||||
break
|
||||
|
||||
for article in articles:
|
||||
headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
|
||||
link = article.find('a', href=True)['href'] if article.find('a', href=True) else None
|
||||
date = article.find('p', class_='categoryArticle__meta').get_text(strip=True) if article.find('p', class_='categoryArticle__meta') else None
|
||||
excerpt = article.find('p', class_='categoryArticle__excerpt').get_text(strip=True) if article.find('p', class_='categoryArticle__excerpt') else None
|
||||
author = date.split('|')[-1].strip() if '|' in date else "Unknown Author"
|
||||
timestamp = date.split('|')[0].strip() if '|' in date else date
|
||||
|
||||
if headline and link and date:
|
||||
news_data.append({
|
||||
'headline': headline,
|
||||
'link': link,
|
||||
'date': timestamp,
|
||||
'author': author,
|
||||
'excerpt': excerpt,
|
||||
'keywords': extract_keywords(headline + " " + excerpt if excerpt else headline),
|
||||
'sentiment_analysis': None # Placeholder for future sentiment analysis
|
||||
})
|
||||
|
||||
page_number += 1
|
||||
time.sleep(2)
|
||||
|
||||
driver.quit()
|
||||
return news_data
|
||||
|
||||
def run_scraper():
|
||||
file_path = os.path.join(DATA_DIR, 'oil_news.json')
|
||||
news_data = scrape_oil_news()
|
||||
save_to_json(news_data, file_path)
|
||||
|
||||
251
src/Data-Collection/WebScraper/scrapers/oil_news_preprocessor.py
Normal file
@@ -0,0 +1,251 @@
|
||||
import json
|
||||
import re
|
||||
import os
|
||||
import time
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.firefox.options import Options
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from bs4 import BeautifulSoup
|
||||
from tqdm import tqdm # Progress bar
|
||||
|
||||
OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
|
||||
SCRAPER_DIR = os.path.dirname(os.path.dirname(__file__)) # One level up
|
||||
DATA_DIR = os.path.join(SCRAPER_DIR, "data")
|
||||
KEYWORD_FILE_PATH = os.path.join(SCRAPER_DIR, "assets", "oil_key_words.txt")
|
||||
|
||||
if not os.path.exists(DATA_DIR):
|
||||
os.makedirs(DATA_DIR)
|
||||
|
||||
def load_existing_data(file_path):
|
||||
if os.path.exists(file_path):
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
return json.load(f)
|
||||
return []
|
||||
|
||||
def save_to_json(data, file_path):
|
||||
existing_data = load_existing_data(file_path)
|
||||
existing_links = {article['link'] for article in existing_data if 'link' in article}
|
||||
|
||||
new_data = []
|
||||
for article in data:
|
||||
if 'link' not in article or article['link'] in existing_links:
|
||||
print(f"Skipping duplicate or missing link article: {article.get('headline', 'Unknown Headline')}")
|
||||
continue
|
||||
new_data.append(article)
|
||||
|
||||
combined_data = existing_data + new_data
|
||||
|
||||
with open(file_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(combined_data, f, ensure_ascii=False, indent=4)
|
||||
print(f"Data saved to {file_path}")
|
||||
|
||||
def load_keyword_importance(file_path):
|
||||
keyword_importance = {}
|
||||
if os.path.exists(file_path):
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
for line in f:
|
||||
parts = line.strip().split()
|
||||
if len(parts) == 2:
|
||||
keyword, importance = parts
|
||||
keyword_importance[keyword.lower()] = int(importance)
|
||||
else:
|
||||
print(f"Keyword file not found at {file_path}")
|
||||
return keyword_importance
|
||||
|
||||
keyword_importance = load_keyword_importance(KEYWORD_FILE_PATH)
|
||||
|
||||
def extract_keywords(text, keyword_importance):
|
||||
words = re.findall(r'\b\w+\b', text.lower())
|
||||
keywords = {word: keyword_importance[word] for word in words if word in keyword_importance}
|
||||
return sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:10]
|
||||
|
||||
def filter_content(content):
|
||||
"""Remove advertisements, irrelevant phrases, headers, and disclaimers from content."""
|
||||
patterns = [
|
||||
r'ADVERTISEMENT',
|
||||
r'Click Here for \d+\+ Global Oil Prices',
|
||||
r'Find us on:',
|
||||
r'Back to homepage',
|
||||
r'Join the discussion',
|
||||
r'More Top Reads From Oilprice.com',
|
||||
r'©OilPrice\.com.*?educational purposes',
|
||||
r'A Media Solutions.*?Oilprice.com',
|
||||
r'\"It\'s most important 8 minute read of my week…\"',
|
||||
r'^[\w\s]*?is a [\w\s]*? for Oilprice\.com.*?More Info',
|
||||
r'^.*?DNOW is a supplier.*?,',
|
||||
]
|
||||
|
||||
for pattern in patterns:
|
||||
content = re.sub(pattern, '', content, flags=re.IGNORECASE)
|
||||
content = re.sub(r'\s+', ' ', content).strip()
|
||||
return content
|
||||
|
||||
def scrape_author_info(driver, author_url, headline_pages=1):
|
||||
"""Scrape author's name, bio, contributor since date, and latest article headlines with excerpts, keywords, and timestamp."""
|
||||
author_name = "Unknown"
|
||||
author_bio = ""
|
||||
contributor_since = ""
|
||||
other_articles = []
|
||||
|
||||
try:
|
||||
# Load author page
|
||||
driver.get(author_url)
|
||||
WebDriverWait(driver, 15).until(
|
||||
EC.presence_of_element_located((By.TAG_NAME, "h1"))
|
||||
)
|
||||
page_source = driver.page_source
|
||||
bio_soup = BeautifulSoup(page_source, "html.parser")
|
||||
|
||||
# Extract author name
|
||||
author_name_tag = bio_soup.find('h1')
|
||||
author_name = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"
|
||||
|
||||
# Extract author bio
|
||||
author_bio_tag = bio_soup.find('div', class_='biography')
|
||||
author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"
|
||||
|
||||
# Extract contributor since date
|
||||
contributor_since_tag = bio_soup.find('p', class_='contributor_since')
|
||||
contributor_since = contributor_since_tag.get_text(strip=True).replace("Contributor since: ", "") if contributor_since_tag else "Unknown Date"
|
||||
|
||||
# Extract latest articles by author with heading, excerpt, keywords, and timestamp
|
||||
for page in range(1, headline_pages + 1):
|
||||
driver.get(f"{author_url}/Page-{page}.html")
|
||||
WebDriverWait(driver, 10).until(
|
||||
EC.presence_of_element_located((By.CLASS_NAME, "articles"))
|
||||
)
|
||||
page_soup = BeautifulSoup(driver.page_source, "html.parser")
|
||||
article_tags = page_soup.find_all('li', class_='clear')
|
||||
|
||||
for article in article_tags:
|
||||
heading_tag = article.find('h3')
|
||||
excerpt_tag = article.find('p', class_='articlecontent')
|
||||
timestamp_tag = article.find('div', class_='meta')
|
||||
|
||||
if heading_tag and excerpt_tag and timestamp_tag:
|
||||
heading = heading_tag.get_text(strip=True)
|
||||
excerpt = filter_content(excerpt_tag.get_text(strip=True)) # Use filter_content
|
||||
timestamp = timestamp_tag.get_text(strip=True).split("|")[0].replace("Published ", "").strip()
|
||||
keywords = [keyword for keyword, _ in extract_keywords(excerpt, keyword_importance)]
|
||||
|
||||
other_articles.append({
|
||||
"heading": heading,
|
||||
"excerpt": excerpt,
|
||||
"keywords": keywords,
|
||||
"published_date": timestamp
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error scraping author info: {e}")
|
||||
author_name = "Error Occurred"
|
||||
author_bio = str(e)
|
||||
contributor_since = "N/A"
|
||||
other_articles = [{"heading": "Error retrieving articles", "excerpt": "", "keywords": [], "published_date": ""}]
|
||||
|
||||
return {
|
||||
"name": author_name,
|
||||
"bio": author_bio,
|
||||
"contributor_since": contributor_since,
|
||||
"other_articles": other_articles
|
||||
}
|
||||
|
||||
def scrape_oil_news():
|
||||
print("Scraping oil news articles for sentiment analysis...")
|
||||
|
||||
options = Options()
|
||||
options.headless = True
|
||||
driver = webdriver.Firefox(options=options)
|
||||
|
||||
news_data = []
|
||||
page_number = 1
|
||||
max_pages = 1
|
||||
total_articles = 0
|
||||
|
||||
while page_number <= max_pages:
|
||||
driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
|
||||
try:
|
||||
WebDriverWait(driver, 10).until(
|
||||
EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
|
||||
)
|
||||
except:
|
||||
break
|
||||
soup = BeautifulSoup(driver.page_source, "html.parser")
|
||||
total_articles += len(soup.find_all('div', class_='categoryArticle'))
|
||||
page_number += 1
|
||||
|
||||
page_number = 1
|
||||
with tqdm(total=total_articles, desc="Scraping articles", unit="article") as pbar:
|
||||
while page_number <= max_pages:
|
||||
print(f"\nProcessing page {page_number}...")
|
||||
driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
|
||||
soup = BeautifulSoup(driver.page_source, "html.parser")
|
||||
articles = soup.find_all('div', class_='categoryArticle')
|
||||
if not articles:
|
||||
break
|
||||
|
||||
for article in articles:
|
||||
headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
|
||||
link_tag = article.find('a', href=True)
|
||||
link = link_tag['href'] if link_tag else None
|
||||
date_meta = article.find('p', class_='categoryArticle__meta')
|
||||
date = date_meta.get_text(strip=True).split('|')[0].strip() if date_meta else None
|
||||
|
||||
content = ""
|
||||
if link:
|
||||
print(f"Fetching article: {link}")
|
||||
driver.get(link)
|
||||
try:
|
||||
WebDriverWait(driver, 10).until(
|
||||
EC.presence_of_element_located((By.CLASS_NAME, "singleArticle"))
|
||||
)
|
||||
article_soup = BeautifulSoup(driver.page_source, "html.parser")
|
||||
raw_content = " ".join([p.get_text(strip=True) for p in article_soup.find_all('p')])
|
||||
content = filter_content(raw_content)
|
||||
|
||||
# Fetch author info using scrape_author_info
|
||||
author_url = article_soup.find('a', text=re.compile(r'More Info|Read More', re.IGNORECASE))['href']
|
||||
author_info = scrape_author_info(driver, author_url, headline_pages=1)
|
||||
|
||||
except:
|
||||
print(f"Error: Content did not load for article {headline}.")
|
||||
author_info = {
|
||||
"name": "Unknown",
|
||||
"bio": "",
|
||||
"contributor_since": "",
|
||||
"other_articles": []
|
||||
}
|
||||
|
||||
extracted_keywords = extract_keywords(f"{headline} {content}", keyword_importance)
|
||||
|
||||
if headline and link and date:
|
||||
news_data.append({
|
||||
'headline': headline,
|
||||
'link': link,
|
||||
'content': content,
|
||||
'date': date,
|
||||
'author': author_info['name'],
|
||||
'author_bio': author_info['bio'],
|
||||
'contributor_since': author_info['contributor_since'],
|
||||
'other_articles': author_info['other_articles'],
|
||||
'keywords': extracted_keywords,
|
||||
})
|
||||
|
||||
pbar.set_postfix_str(f"Processing article: {headline[:40]}...")
|
||||
pbar.update(1)
|
||||
|
||||
page_number += 1
|
||||
time.sleep(2)
|
||||
|
||||
driver.quit()
|
||||
return news_data
|
||||
|
||||
def run_preprocessor():
|
||||
file_path = os.path.join(DATA_DIR, 'preprocessed_oil_news.json')
|
||||
news_data = scrape_oil_news()
|
||||
save_to_json(news_data, file_path)
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_preprocessor()
|
||||
|
||||
143
src/Data-Collection/WebScraper/scrapers/oil_news_scraper.py
Normal file
@@ -0,0 +1,143 @@
import json
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import os
import time
import re

OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
DATA_DIR = os.path.join(os.getcwd(), "data")
KEYWORD_FILE_PATH = os.path.join(os.getcwd(), "assets", "oil_key_words.txt")

if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

def load_existing_data(file_path):
    """Load existing data from JSON file to avoid duplicates."""
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    return []

def save_to_json(data, file_path):
    """Save scraped data to a JSON file, ensuring no duplicates."""
    existing_data = load_existing_data(file_path)
    existing_links = {article['link'] for article in existing_data}

    new_data = []
    for article in data:
        if article['link'] in existing_links:
            print(f"Skipping duplicate article: {article['headline']}")
            continue
        new_data.append(article)

    combined_data = existing_data + new_data

    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(combined_data, f, ensure_ascii=False, indent=4)
    print(f"Oil news data saved to {file_path}")

def load_keyword_importance(file_path):
    """Load keyword importance values from the oil_key_words.txt file."""
    keyword_importance = {}
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) == 2:
                    keyword, importance = parts
                    keyword_importance[keyword.lower()] = int(importance)
    else:
        print(f"Keyword file not found at {file_path}")
    return keyword_importance

keyword_importance = load_keyword_importance(KEYWORD_FILE_PATH)

def extract_keywords(text, keyword_importance):
    """Extract important keywords from text based on an external keyword list."""
    words = re.findall(r'\b\w+\b', text.lower())
    keywords = {}

    for word in words:
        if len(word) > 3 and word in keyword_importance:
            keywords[word] = keyword_importance[word]  # Store keyword with its importance

    # Return up to 10 unique keywords with their importance
    return sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:10]

def analyze_sentiment(text):
    """Basic sentiment analysis placeholder with minimal processing."""
    # Only check for specific keywords; avoid complex logic to save time
    if "profit" in text or "rise" in text:
        return "Positive"
    elif "loss" in text or "decline" in text:
        return "Negative"
    else:
        return "Neutral"

def scrape_oil_news():
    print("Scraping oil market news using Selenium...")

    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options)

    news_data = []
    page_number = 1
    max_pages = 10  # Limit to 10 pages

    while page_number <= max_pages:
        print(f"Processing page {page_number}...")
        driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")

        try:
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
            )
        except Exception as e:
            print(f"Error: Content did not load properly on page {page_number}.")
            break

        soup = BeautifulSoup(driver.page_source, "html.parser")

        articles = soup.find_all('div', class_='categoryArticle')
        if not articles:
            print(f"No articles found on page {page_number}. Ending pagination.")
            break

        for article in articles:
            headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
            link = article.find('a', href=True)['href'] if article.find('a', href=True) else None
            date = article.find('p', class_='categoryArticle__meta').get_text(strip=True) if article.find('p', class_='categoryArticle__meta') else None
            excerpt = article.find('p', class_='categoryArticle__excerpt').get_text(strip=True) if article.find('p', class_='categoryArticle__excerpt') else None
            author = date.split('|')[-1].strip() if '|' in date else "Unknown Author"
            timestamp = date.split('|')[0].strip() if '|' in date else date
            extracted_keywords = extract_keywords(headline + " " + excerpt if excerpt else headline, keyword_importance)

            if headline and link and date:
                news_data.append({
                    'headline': headline,
                    'link': link,
                    'date': timestamp,
                    'author': author,
                    'excerpt': excerpt,
                    'keywords': extracted_keywords,
                    'sentiment_analysis': None
                    #'sentiment_analysis': analyze_sentiment(headline + " " + excerpt if excerpt else headline)
                })

        page_number += 1
        time.sleep(2)

    driver.quit()
    return news_data

def run_scraper():
    file_path = os.path.join(DATA_DIR, 'oil_news.json')
    news_data = scrape_oil_news()
    save_to_json(news_data, file_path)
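Note that date can be None when categoryArticle__meta is missing, in which case the '|' in date check above raises a TypeError before the if headline and link and date guard runs. A minimal defensive rewrite of those two lines, offered as a sketch rather than as part of this commit:

if date and '|' in date:
    author = date.split('|')[-1].strip()
    timestamp = date.split('|')[0].strip()
else:
    author = "Unknown Author"
    timestamp = date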
347
src/Data-Collection/WebScraper/scrapers/tests/author_info.json
Normal file
@@ -0,0 +1,347 @@
|
||||
{
|
||||
"name": "Charles Kennedy",
|
||||
"bio": "Charles is a writer for Oilprice.com",
|
||||
"contributor_since": "29 Sep 2011",
|
||||
"other_articles": [
|
||||
{
|
||||
"heading": "Record Shale Production Helps ConocoPhillips Beat Profit Estimates",
|
||||
"excerpt": "ConocoPhillips (NYSE: COP) is raising its ordinary dividend and share buyback program as its third-quarter earnings beat market expectations on the back of higher total…",
|
||||
"keywords": [
|
||||
"share",
|
||||
"market",
|
||||
"higher",
|
||||
"back",
|
||||
"total",
|
||||
"expectations",
|
||||
"third",
|
||||
"beat",
|
||||
"raising",
|
||||
"conocophillips"
|
||||
],
|
||||
"published_date": "31 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Rosneft to Resume Output at Idled Black Sea Refinery in November",
|
||||
"excerpt": "Rosneft plans to resume crude processing at its Tuapse oil refinery on Russia’s Black Sea coast in November, after idling it for a month because…",
|
||||
"keywords": [
|
||||
"processing",
|
||||
"idling",
|
||||
"russia",
|
||||
"plans",
|
||||
"rosneft",
|
||||
"refinery",
|
||||
"tuapse",
|
||||
"crude",
|
||||
"november",
|
||||
"black"
|
||||
],
|
||||
"published_date": "31 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Canadian Natural Resources Q3 Profit Slips as Oil and Gas Prices Fall",
|
||||
"excerpt": "Canada’s largest oil and gas producer, Canadian Natural Resources (NYSE: CNQ), reported lower adjusted net earnings from operations for the third quarter compared to a…",
|
||||
"keywords": [
|
||||
"canada",
|
||||
"operations",
|
||||
"producer",
|
||||
"resources",
|
||||
"reported",
|
||||
"canadian",
|
||||
"largest",
|
||||
"third",
|
||||
"natural",
|
||||
"nyse"
|
||||
],
|
||||
"published_date": "31 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Exelon Reports 80% Surge in Data Center Power Supply Deals",
|
||||
"excerpt": "Exelon has seen an 80% increase in power supply deals coming from data enter operators in the latest sign that the IT industry is driving…",
|
||||
"keywords": [
|
||||
"industry",
|
||||
"data",
|
||||
"driving",
|
||||
"seen",
|
||||
"power",
|
||||
"increase",
|
||||
"exelon",
|
||||
"deals",
|
||||
"sign",
|
||||
"that"
|
||||
],
|
||||
"published_date": "31 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Russia’s Gazprom Boosts 2024 Investments to $16.9 Billion",
|
||||
"excerpt": "Gazprom is raising its investment plan for 2024 by 4% to $16.9 billion (1.642 trillion Russian rubles), thanks to rising exports and domestic supply, the…",
|
||||
"keywords": [
|
||||
"investment",
|
||||
"russian",
|
||||
"rubles",
|
||||
"plan",
|
||||
"exports",
|
||||
"billion",
|
||||
"raising",
|
||||
"thanks",
|
||||
"trillion",
|
||||
"supply"
|
||||
],
|
||||
"published_date": "30 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Investment Giants Form $50-Billion AI and Power Partnership",
|
||||
"excerpt": "Global investment firm KKR and private-equity giant Energy Capital Partners on Wednesday announced a $50 billion strategic partnership to invest in data centers and power…",
|
||||
"keywords": [
|
||||
"centers",
|
||||
"strategic",
|
||||
"investment",
|
||||
"giant",
|
||||
"energy",
|
||||
"capital",
|
||||
"private",
|
||||
"wednesday",
|
||||
"billion",
|
||||
"data"
|
||||
],
|
||||
"published_date": "30 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Vietnamese EV Maker Gets $1 Billion in Funding Led by UAE",
|
||||
"excerpt": "Vietnam’s electric vehicle manufacturer VinFast Auto is expected to receive at least $1 billion in overseas funding led by Emirates Driving Company (EDC), Abu Dhabi’s…",
|
||||
"keywords": [
|
||||
"overseas",
|
||||
"manufacturer",
|
||||
"vietnam",
|
||||
"expected",
|
||||
"billion",
|
||||
"driving",
|
||||
"emirates",
|
||||
"funding",
|
||||
"receive",
|
||||
"least"
|
||||
],
|
||||
"published_date": "30 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Chinese Oil Major to Explore Iraqi Field",
|
||||
"excerpt": "China’s CNOOC has inked a deal for exploration at an oil field in central Iraq, the company said today.\nThe deposit, Block 7, will be…",
|
||||
"keywords": [
|
||||
"deposit",
|
||||
"cnooc",
|
||||
"iraq",
|
||||
"field",
|
||||
"central",
|
||||
"deal",
|
||||
"today",
|
||||
"said",
|
||||
"china",
|
||||
"inked"
|
||||
],
|
||||
"published_date": "30 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "TotalEnergies to Produce More Gas Condensate Offshore Denmark",
|
||||
"excerpt": "U.S. refining and chemicals giant Phillips 66 (NYSE: PSX) booked higher-than-expected earnings for the third quarter even if earnings plunged from a year earlier, as…",
|
||||
"keywords": [
|
||||
"phillips",
|
||||
"refining",
|
||||
"giant",
|
||||
"than",
|
||||
"expected",
|
||||
"higher",
|
||||
"year",
|
||||
"plunged",
|
||||
"third",
|
||||
"even"
|
||||
],
|
||||
"published_date": "29 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Phillips 66 Beats Analyst Estimates Despite Earnings Dip in Q3",
|
||||
"excerpt": "U.S. refining and chemicals giant Phillips 66 (NYSE: PSX) booked higher-than-expected earnings for the third quarter even if earnings plunged from a year earlier, as…",
|
||||
"keywords": [
|
||||
"phillips",
|
||||
"refining",
|
||||
"giant",
|
||||
"than",
|
||||
"expected",
|
||||
"higher",
|
||||
"year",
|
||||
"plunged",
|
||||
"third",
|
||||
"even"
|
||||
],
|
||||
"published_date": "29 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "UK Offshore Oil Platform Halted Due to Gas Compressor Issue",
|
||||
"excerpt": "Production via the Triton Floating Production Storage & Offloading (FPSO) vessel in the UK North Sea has been halted due to a problem with the…",
|
||||
"keywords": [
|
||||
"fpso",
|
||||
"been",
|
||||
"with",
|
||||
"problem",
|
||||
"halted",
|
||||
"storage",
|
||||
"triton",
|
||||
"vessel",
|
||||
"offloading",
|
||||
"north"
|
||||
],
|
||||
"published_date": "29 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "UAE’s Renewable Energy Giant Pushes Back Green Hydrogen Targets",
|
||||
"excerpt": "Masdar, the clean energy giant of the United Arab Emirates (UAE), has pushed back its target to reach 1 million tons per year of green…",
|
||||
"keywords": [
|
||||
"united",
|
||||
"energy",
|
||||
"giant",
|
||||
"emirates",
|
||||
"back",
|
||||
"year",
|
||||
"million",
|
||||
"arab",
|
||||
"pushed",
|
||||
"target"
|
||||
],
|
||||
"published_date": "28 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Profit at India’s Top Refiner Slumps by 99% Due to Weak Margins",
|
||||
"excerpt": "IndianOil, the biggest refiner in India, reported on Monday a net profit tumbling by 98.6% in the quarter to September from a year ago amid…",
|
||||
"keywords": [
|
||||
"refiner",
|
||||
"monday",
|
||||
"september",
|
||||
"biggest",
|
||||
"reported",
|
||||
"indianoil",
|
||||
"india",
|
||||
"year",
|
||||
"tumbling",
|
||||
"profit"
|
||||
],
|
||||
"published_date": "28 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Average U.S. Gasoline Price Set to Drop Below $3 for the First Time Since 2021",
|
||||
"excerpt": "The U.S. national average price of gasoline is set to soon fall below $3 per gallon for the first time since 2021, amid lower seasonal…",
|
||||
"keywords": [
|
||||
"gasoline",
|
||||
"national",
|
||||
"below",
|
||||
"gallon",
|
||||
"soon",
|
||||
"first",
|
||||
"lower",
|
||||
"average",
|
||||
"seasonal",
|
||||
"price"
|
||||
],
|
||||
"published_date": "28 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "FERC Grants Exxon and Qatar Three-Year Extension to Build Golden Pass LNG",
|
||||
"excerpt": "The U.S. Federal Energy Regulatory Commission has granted a three-year extension to ExxonMobil and QatarEnergy to build their $10-billion Golden Pass LNG export plant in…",
|
||||
"keywords": [
|
||||
"federal",
|
||||
"export",
|
||||
"three",
|
||||
"energy",
|
||||
"golden",
|
||||
"billion",
|
||||
"year",
|
||||
"their",
|
||||
"qatarenergy",
|
||||
"regulatory"
|
||||
],
|
||||
"published_date": "25 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Cepsa: Windfall Tax Would Delay Its $3.3-Billion Hydrogen Plan",
|
||||
"excerpt": "Cepsa, Spain’s second-largest oil company, will delay its $3.25 billion (3 billion euros) investment into domestic green hydrogen projects if Spain makes the windfall tax…",
|
||||
"keywords": [
|
||||
"investment",
|
||||
"second",
|
||||
"projects",
|
||||
"billion",
|
||||
"euros",
|
||||
"largest",
|
||||
"into",
|
||||
"delay",
|
||||
"will",
|
||||
"cepsa"
|
||||
],
|
||||
"published_date": "25 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "South Africa Seeks Loan Guarantees for Energy Transition Funding",
|
||||
"excerpt": "South Africa is currently negotiating loan guarantees with its international partners in its $9.3-billion Just Energy Transition Partnership (JETP) program for energy investment.\nThe International…",
|
||||
"keywords": [
|
||||
"jetp",
|
||||
"negotiating",
|
||||
"energy",
|
||||
"transition",
|
||||
"currently",
|
||||
"investment",
|
||||
"billion",
|
||||
"south",
|
||||
"africa",
|
||||
"guarantees"
|
||||
],
|
||||
"published_date": "25 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Saudi Oil Export Revenues Hit Three-Year Low as Prices Decline",
|
||||
"excerpt": "Lower crude oil prices dragged Saudi Arabia’s oil export revenues to the lowest level in more than three years in August, amid underwhelming oil demand…",
|
||||
"keywords": [
|
||||
"years",
|
||||
"three",
|
||||
"august",
|
||||
"than",
|
||||
"more",
|
||||
"dragged",
|
||||
"revenues",
|
||||
"saudi",
|
||||
"crude",
|
||||
"prices"
|
||||
],
|
||||
"published_date": "24 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Tesla Stock Soars After Q3 Earnings Beat",
|
||||
"excerpt": "Tesla (NASDAQ: TSLA) saw its shares jump by 20% after hours on Wednesday and another 14% in pre-market trade on Thursday after reporting earnings for…",
|
||||
"keywords": [
|
||||
"thursday",
|
||||
"after",
|
||||
"trade",
|
||||
"market",
|
||||
"tesla",
|
||||
"wednesday",
|
||||
"another",
|
||||
"nasdaq",
|
||||
"hours",
|
||||
"reporting"
|
||||
],
|
||||
"published_date": "24 October 2024"
|
||||
},
|
||||
{
|
||||
"heading": "Oil Refining Giant Valero Tops Estimates Despite Q3 Profit Plunge",
|
||||
"excerpt": "One of the biggest U.S. refiners, Valero Energy (NYSE: VLO), beat Wall Street estimates even as it reported a widely expected plunge in its third-quarter…",
|
||||
"keywords": [
|
||||
"street",
|
||||
"energy",
|
||||
"biggest",
|
||||
"wall",
|
||||
"reported",
|
||||
"expected",
|
||||
"plunge",
|
||||
"widely",
|
||||
"third",
|
||||
"valero"
|
||||
],
|
||||
"published_date": "24 October 2024"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,109 @@
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.firefox.options import Options
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
AUTHOR_URL = "https://oilprice.com/contributors/Charles-Kennedy" # Replace with actual author URL
|
||||
OUTPUT_FILE = "author_info.json"
|
||||
|
||||
def extract_keywords(text):
|
||||
"""Basic keyword extraction by finding unique words longer than 3 characters."""
|
||||
words = re.findall(r'\b\w{4,}\b', text.lower())
|
||||
keywords = list(set(words))
|
||||
return keywords[:10] # Limit to top 10 unique keywords for simplicity
|
||||
|
||||
def scrape_author_info(author_url, headline_pages=1):
|
||||
"""Scrape author's name, bio, contributor since date, and latest article headlines with excerpts, keywords, and timestamp."""
|
||||
options = Options()
|
||||
options.headless = True
|
||||
driver = webdriver.Firefox(options=options)
|
||||
|
||||
author_name = "Unknown"
|
||||
author_bio = ""
|
||||
contributor_since = ""
|
||||
other_articles = []
|
||||
|
||||
try:
|
||||
# Load author page
|
||||
driver.get(author_url)
|
||||
WebDriverWait(driver, 15).until(
|
||||
EC.presence_of_element_located((By.TAG_NAME, "h1"))
|
||||
)
|
||||
page_source = driver.page_source
|
||||
bio_soup = BeautifulSoup(page_source, "html.parser")
|
||||
|
||||
# Extract author name
|
||||
author_name_tag = bio_soup.find('h1')
|
||||
author_name = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"
|
||||
|
||||
# Extract author bio
|
||||
author_bio_tag = bio_soup.find('div', class_='biography')
|
||||
author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"
|
||||
|
||||
# Extract contributor since date
|
||||
contributor_since_tag = bio_soup.find('p', class_='contributor_since')
|
||||
contributor_since = contributor_since_tag.get_text(strip=True).replace("Contributor since: ", "") if contributor_since_tag else "Unknown Date"
|
||||
|
||||
# Extract latest articles by author with heading, excerpt, keywords, and timestamp
|
||||
for page in range(1, headline_pages + 1):
|
||||
driver.get(f"{author_url}/Page-{page}.html")
|
||||
WebDriverWait(driver, 10).until(
|
||||
EC.presence_of_element_located((By.CLASS_NAME, "articles"))
|
||||
)
|
||||
page_soup = BeautifulSoup(driver.page_source, "html.parser")
|
||||
article_tags = page_soup.find_all('li', class_='clear')
|
||||
|
||||
for article in article_tags:
|
||||
heading_tag = article.find('h3')
|
||||
excerpt_tag = article.find('p', class_='articlecontent')
|
||||
timestamp_tag = article.find('div', class_='meta')
|
||||
|
||||
if heading_tag and excerpt_tag and timestamp_tag:
|
||||
heading = heading_tag.get_text(strip=True)
|
||||
excerpt = excerpt_tag.get_text(strip=True)
|
||||
timestamp = timestamp_tag.get_text(strip=True).split("|")[0].replace("Published ", "").strip()
|
||||
keywords = extract_keywords(excerpt)
|
||||
|
||||
other_articles.append({
|
||||
"heading": heading,
|
||||
"excerpt": excerpt,
|
||||
"keywords": keywords,
|
||||
"published_date": timestamp
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error scraping author info: {e}")
|
||||
author_name = "Error Occurred"
|
||||
author_bio = str(e)
|
||||
contributor_since = "N/A"
|
||||
other_articles = [{"heading": "Error retrieving articles", "excerpt": "", "keywords": [], "published_date": ""}]
|
||||
|
||||
finally:
|
||||
driver.quit()
|
||||
|
||||
return {
|
||||
"name": author_name,
|
||||
"bio": author_bio,
|
||||
"contributor_since": contributor_since,
|
||||
"other_articles": other_articles
|
||||
}
|
||||
|
||||
def save_to_json(data, output_file):
|
||||
"""Save author info to a JSON file."""
|
||||
with open(output_file, mode="w", encoding="utf-8") as file:
|
||||
json.dump(data, file, ensure_ascii=False, indent=4)
|
||||
|
||||
print(f"Author info saved to {output_file}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Scrape author info
|
||||
author_info = scrape_author_info(AUTHOR_URL, headline_pages=1)
|
||||
|
||||
# Save to JSON
|
||||
save_to_json(author_info, OUTPUT_FILE)
|
||||
|
||||
@@ -0,0 +1,106 @@
import json
import re
import time
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

AUTHOR_URL = "https://oilprice.com/contributors/Charles-Kennedy"  # Replace with actual author URL
OUTPUT_FILE = "author_info.json"


def extract_keywords(text):
    """Basic keyword extraction by finding unique words longer than 3 characters."""
    words = re.findall(r'\b\w{4,}\b', text.lower())
    keywords = list(set(words))
    return keywords[:10]  # Limit to top 10 unique keywords for simplicity


def scrape_author_info(author_url, headline_pages=1):
    """Scrape the author's name, bio, contributor-since date, and latest article headlines with excerpts and keywords."""
    options = Options()
    options.add_argument("--headless")  # the options.headless attribute is deprecated/removed in newer Selenium releases
    driver = webdriver.Firefox(options=options)

    author_name = "Unknown"
    author_bio = ""
    contributor_since = ""
    other_articles = []

    try:
        # Load author page
        driver.get(author_url)
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )
        page_source = driver.page_source
        bio_soup = BeautifulSoup(page_source, "html.parser")

        # Extract author name
        author_name_tag = bio_soup.find('h1')
        author_name = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"

        # Extract author bio
        author_bio_tag = bio_soup.find('div', class_='biography')
        author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"

        # Extract contributor since date
        contributor_since_tag = bio_soup.find('p', class_='contributor_since')
        contributor_since = contributor_since_tag.get_text(strip=True).replace("Contributor since: ", "") if contributor_since_tag else "Unknown Date"

        # Extract latest articles by author with heading, excerpt, and keywords
        for page in range(1, headline_pages + 1):
            driver.get(f"{author_url}/Page-{page}.html")
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "articles"))
            )
            page_soup = BeautifulSoup(driver.page_source, "html.parser")
            article_tags = page_soup.find_all('li', class_='clear')

            for article in article_tags:
                heading_tag = article.find('h3')
                excerpt_tag = article.find('p', class_='articlecontent')

                if heading_tag and excerpt_tag:
                    heading = heading_tag.get_text(strip=True)
                    excerpt = excerpt_tag.get_text(strip=True)
                    keywords = extract_keywords(excerpt)

                    other_articles.append({
                        "heading": heading,
                        "excerpt": excerpt,
                        "keywords": keywords
                    })

    except Exception as e:
        print(f"Error scraping author info: {e}")
        author_name = "Error Occurred"
        author_bio = str(e)
        contributor_since = "N/A"
        other_articles = [{"heading": "Error retrieving articles", "excerpt": "", "keywords": []}]

    finally:
        driver.quit()

    return {
        "name": author_name,
        "bio": author_bio,
        "contributor_since": contributor_since,
        "other_articles": other_articles
    }


def save_to_json(data, output_file):
    """Save author info to a JSON file."""
    with open(output_file, mode="w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

    print(f"Author info saved to {output_file}")


if __name__ == "__main__":
    # Scrape author info
    author_info = scrape_author_info(AUTHOR_URL, headline_pages=1)

    # Save to JSON
    save_to_json(author_info, OUTPUT_FILE)

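For reference, a small usage sketch (not part of the committed file) for inspecting the scraper's output after a run; it assumes the default author_info.json written by save_to_json and the keys returned by scrape_author_info:

import json

# Load the file written by save_to_json and print a quick summary.
with open("author_info.json", encoding="utf-8") as fh:
    info = json.load(fh)

print(info["name"], "-", info["contributor_since"])
for article in info["other_articles"]:
    print(f'{article["heading"]}: {", ".join(article["keywords"])}')
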
@@ -0,0 +1,26 @@
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
import time

# Provide the path to your geckodriver executable using the Service class
service = Service(executable_path='/usr/local/bin/geckodriver')
driver = webdriver.Firefox(service=service)

# Open a website (e.g., OilPrice.com)
driver.get("https://oilprice.com/Latest-Energy-News/World-News/")

# Wait for the page to load
time.sleep(5)

# Print the title of the page to verify that it's loaded
print(driver.title)

# Find and print some element on the page, e.g., all article titles
articles = driver.find_elements(By.CSS_SELECTOR, "div.categoryArticle")
for article in articles:
    title = article.find_element(By.TAG_NAME, "a").text
    print(f"Article title: {title}")

# Close the browser
driver.quit()

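The fixed time.sleep(5) in the snippet above is a blunt wait; here is a minimal sketch of the same check using the explicit-wait pattern the author-scraper script already uses. It reuses the driver and By objects from the snippet and assumes the div.categoryArticle selector is still present on the page:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block for up to 10 seconds until at least one article card is present,
# then continue immediately instead of always sleeping a fixed interval.
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "div.categoryArticle"))
)
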
0
src/Data-Collection/WebScraper/setup.py
Normal file
246
src/GUSHTradingBotV1.0.py
Normal file
@@ -0,0 +1,246 @@
import numpy as np
import pandas as pd
import yfinance as yf
from scipy.optimize import minimize


def ticker_info():
    ticker = "gush"
    return ticker.upper()


def fetch_expiration_dates(ticker):
    print(f"Fetching available expiration dates for {ticker}...")
    stock = yf.Ticker(ticker)
    expiration_dates = stock.options
    print(f"Available expiration dates: {expiration_dates}")
    return expiration_dates


def select_expiration_date(expiration_dates):
    print("Selecting the first available expiration date...")
    expiration_date = expiration_dates[0]
    print(f"Selected expiration date: {expiration_date}")
    return expiration_date


def fetch_option_chain(ticker, expiration_date):
    print(f"Fetching option chain for {ticker} with expiration date {expiration_date}...")
    stock = yf.Ticker(ticker)
    options_chain = stock.option_chain(expiration_date)
    print("Option chain fetched successfully!")
    return options_chain


def get_price_data(ticker, start_date, end_date):
    print(f"Fetching price data for {ticker} from {start_date} to {end_date}...")
    data = yf.download(ticker, start=start_date, end=end_date)
    print(f"Price data fetched successfully for {ticker}!")
    return data


def moving_average_strategy(data, short_window=20, long_window=50):
    data['Short_MA'] = data['Close'].rolling(window=short_window).mean()
    data['Long_MA'] = data['Close'].rolling(window=long_window).mean()
    data['Signal'] = np.where(data['Short_MA'] > data['Long_MA'], 1, -1)
    return data['Signal']


def rsi_strategy(data, window=14, overbought=70, oversold=30):
    delta = data['Close'].diff(1)
    gain = np.where(delta > 0, delta, 0).flatten()  # Flatten to 1D array
    loss = np.where(delta < 0, abs(delta), 0).flatten()  # Flatten to 1D array

    avg_gain = pd.Series(gain).rolling(window=window).mean()
    avg_loss = pd.Series(loss).rolling(window=window).mean()

    # Avoid division by zero by using np.where to replace 0 with np.nan in avg_loss
    rs = avg_gain / np.where(avg_loss == 0, np.nan, avg_loss)

    rsi = 100 - (100 / (1 + rs))

    signal = np.where(rsi < oversold, 1, np.where(rsi > overbought, -1, 0))
    return pd.Series(signal, index=data.index)


def bollinger_bands_strategy(data, window=20, num_std=2):
    # Calculate moving average
    data['Moving_Avg'] = data['Close'].rolling(window=window).mean()

    # Calculate rolling standard deviation and force it to be a Series
    rolling_std = data['Close'].rolling(window).std()
    rolling_std = rolling_std.squeeze()  # Ensure rolling_std is a Series

    # Print shapes for debugging
    print(f"Shape of Moving_Avg: {data['Moving_Avg'].shape}")
    print(f"Shape of Rolling Std: {rolling_std.shape}")

    # Calculate upper and lower bands
    data['Band_Upper'] = data['Moving_Avg'] + (num_std * rolling_std)
    data['Band_Lower'] = data['Moving_Avg'] - (num_std * rolling_std)

    # Print shapes after assignments for debugging
    print(f"Shape of Band_Upper: {data['Band_Upper'].shape}")
    print(f"Shape of Band_Lower: {data['Band_Lower'].shape}")

    # Check for NaN values
    print(f"NaNs in Close: {data['Close'].isna().sum()}")
    print(f"NaNs in Band_Upper: {data['Band_Upper'].isna().sum()}")
    print(f"NaNs in Band_Lower: {data['Band_Lower'].isna().sum()}")

    # Print the columns of the DataFrame
    print(f"Columns in data before dropping NaNs: {data.columns.tolist()}")

    # Optionally drop rows with NaNs
    data = data.dropna(subset=['Close', 'Band_Upper', 'Band_Lower'])

    # Generate signals based on the bands
    signal = np.where(data['Close'] < data['Band_Lower'], 1,
                      np.where(data['Close'] > data['Band_Upper'], -1, 0))

    return pd.Series(signal, index=data.index)


def generate_signals(data):
    ma_signal = moving_average_strategy(data)
    rsi_signal = rsi_strategy(data)
    bollinger_signal = bollinger_bands_strategy(data)
    return pd.DataFrame({'MA': ma_signal, 'RSI': rsi_signal, 'Bollinger': bollinger_signal})


def backtest_option_trades(option_chain, signals, stock_data):
    """
    Backtest option trades based on the given signals and stock data.
    """
    trades = []
    current_position = None

    # Callers pass a 1-D array of combined (weighted) signal values, so work
    # with a flat array here rather than indexing a per-strategy column.
    signal_values = np.asarray(signals).ravel()

    # Ensure both stock_data and option_chain indices are sorted in ascending order
    stock_data = stock_data.sort_index()

    # Convert 'lastTradeDate' or any date-related columns to datetime in option_chain
    if 'lastTradeDate' in option_chain.columns:
        option_chain['lastTradeDate'] = pd.to_datetime(option_chain['lastTradeDate'])
        option_chain = option_chain.set_index('lastTradeDate')

    # If option_chain index isn't datetime, convert it to datetime (ensuring compatibility)
    option_chain.index = pd.to_datetime(option_chain.index)

    # Remove the timezone from option_chain index
    option_chain.index = option_chain.index.tz_localize(None)

    # Now reindex the option chain to match the stock data index (forward fill missing option prices)
    option_chain = option_chain.sort_index()
    option_chain = option_chain.reindex(stock_data.index, method='ffill')

    for i in range(len(signal_values)):
        if signal_values[i] > 0 and current_position is None:
            # BUY signal (positive weighted signal)
            entry_price = option_chain['lastPrice'].iloc[i]
            if pd.isna(entry_price):  # If price is nan, log the error and continue
                print(f"Missing entry price on {stock_data.index[i]}, skipping trade.")
                continue
            entry_date = stock_data.index[i]
            current_position = {
                'entry_price': entry_price,
                'entry_date': entry_date
            }
            print(f"BUY signal on {entry_date}: Entry Price = {entry_price}")

        elif signal_values[i] < 0 and current_position is not None:
            # SELL signal (negative weighted signal)
            exit_price = option_chain['lastPrice'].iloc[i]
            if pd.isna(exit_price):  # If price is nan, log the error and continue
                print(f"Missing exit price on {stock_data.index[i]}, skipping trade.")
                continue
            exit_date = stock_data.index[i]
            pnl = (exit_price - current_position['entry_price']) * 100
            print(f"SELL signal on {exit_date}: Exit Price = {exit_price}, P&L = {pnl}")

            trades.append({
                'entry_date': current_position['entry_date'],
                'entry_price': current_position['entry_price'],
                'exit_date': exit_date,
                'exit_price': exit_price,
                'pnl': pnl
            })
            current_position = None

    cumulative_pnl = sum(trade['pnl'] for trade in trades)
    total_wins = sum(1 for trade in trades if trade['pnl'] > 0)
    total_trades = len(trades)
    win_rate = total_wins / total_trades if total_trades > 0 else 0

    return cumulative_pnl, trades, win_rate


def objective_function_profit(weights, strategy_signals, data, option_chain):
    weights = np.array(weights)
    weights /= np.sum(weights)  # Normalize weights
    weighted_signals = np.sum([signal * weight for signal, weight in zip(strategy_signals.T.values, weights)], axis=0)

    # Since `backtest_option_trades` returns 3 values, we only unpack those
    cumulative_pnl, _, _ = backtest_option_trades(option_chain, weighted_signals, data)

    # Return negative cumulative P&L to maximize profit
    return -cumulative_pnl


def optimize_weights(strategy_signals, data, option_chain):
    initial_weights = [1 / len(strategy_signals.columns)] * len(strategy_signals.columns)
    constraints = ({'type': 'eq', 'fun': lambda weights: np.sum(weights) - 1})
    bounds = [(0, 1)] * len(strategy_signals.columns)

    result = minimize(objective_function_profit, initial_weights, args=(strategy_signals, data, option_chain),
                      method='SLSQP', bounds=bounds, constraints=constraints)
    return result.x  # Optimal weights


def weighted_signal_combination(strategy_signals, weights):
    weighted_signals = np.sum([signal * weight for signal, weight in zip(strategy_signals.T.values, weights)], axis=0)
    return weighted_signals


def main_decision(weighted_signals):
    last_signal = weighted_signals[-1]  # Latest signal
    if last_signal > 0:
        return "BUY"
    elif last_signal < 0:
        return "SELL"
    else:
        return "HOLD"


def run_backtest():
    ticker = ticker_info()
    expiration_dates = fetch_expiration_dates(ticker)
    expiration_date = select_expiration_date(expiration_dates)
    options_chain = fetch_option_chain(ticker, expiration_date)

    # Fetch training data
    train_data = get_price_data(ticker, '2010-01-01', '2022-01-01')

    # Generate signals
    strategy_signals_train = generate_signals(train_data)

    # Optimize weights
    optimal_weights = optimize_weights(strategy_signals_train, train_data, options_chain.calls)

    # Fetch test data
    test_data = get_price_data(ticker, '2022-01-02', '2024-01-01')

    # Generate test signals
    strategy_signals_test = generate_signals(test_data)

    # Combine signals and backtest
    weighted_signals = weighted_signal_combination(strategy_signals_test, optimal_weights)
    cumulative_pnl, trades, win_rate = backtest_option_trades(options_chain.calls, weighted_signals, test_data)

    # Make final decision
    decision = main_decision(weighted_signals)
    print(f"Final decision: {decision}")

    # Output results
    print(f"Cumulative P&L: {cumulative_pnl}")
    print(f"Win Rate: {win_rate * 100:.2f}%")


# Call the main function when run as a script
if __name__ == "__main__":
    run_backtest()

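As a quick illustration of how weighted_signal_combination and main_decision turn the per-strategy signals into a single action, here is a self-contained toy sketch with made-up signal values and weights (in the real bot the weights come from optimize_weights):

import numpy as np
import pandas as pd

# Hypothetical per-strategy signals for three days (+1 buy, -1 sell, 0 hold).
signals = pd.DataFrame({'MA': [1, 1, -1], 'RSI': [0, 1, -1], 'Bollinger': [1, 0, 0]})
weights = np.array([0.5, 0.3, 0.2])  # assumed optimizer output; sums to 1

# Same weighted sum used in weighted_signal_combination().
combined = np.sum([sig * w for sig, w in zip(signals.T.values, weights)], axis=0)
print(combined)  # [ 0.7  0.8 -0.8]
print("BUY" if combined[-1] > 0 else "SELL" if combined[-1] < 0 else "HOLD")  # SELL
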