Switched to JSON; added keyword extraction, author extraction, excerpt extraction, and timestamp data; added a placeholder for future rudimentary sentiment analysis. Increased the number of articles used, added a cap on the articles analyzed to save CPU, and added a function to avoid re-collecting data for the same article.

This commit is contained in:
klein panic
2024-10-31 00:04:22 -04:00
parent 3ae788ed9b
commit e638bea1da
3 changed files with 142 additions and 94 deletions
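The commit message leaves sentiment analysis as a future placeholder ('sentiment_analysis': None in the diff below). A minimal lexicon-based sketch of what could eventually fill that field is shown here; the word lists, function name, and scoring rule are assumptions for illustration only, not part of this commit.

import re

# Tiny, assumed word lists; a real version would use a proper sentiment lexicon or model.
POSITIVE_WORDS = {"gain", "gains", "rise", "rises", "surge", "boost", "record", "growth"}
NEGATIVE_WORDS = {"fall", "falls", "drop", "drops", "slump", "cut", "crisis", "glut"}

def rudimentary_sentiment(text):
    """Return 'positive', 'negative', or 'neutral' based on simple word counts."""
    words = re.findall(r'\b\w+\b', text.lower())
    score = sum(w in POSITIVE_WORDS for w in words) - sum(w in NEGATIVE_WORDS for w in words)
    if score > 0:
        return "positive"
    if score < 0:
        return "negative"
    return "neutral"

# Could later replace the None placeholder, e.g.:
# article['sentiment_analysis'] = rudimentary_sentiment(headline + " " + (excerpt or ""))
print(rudimentary_sentiment("Oil prices surge to a new record"))  # positive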

View File

@@ -1,76 +1,100 @@
import json
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import os
import time
import re
# URL for OilPrice.com homepage
OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
# Set up the data directory
DATA_DIR = os.path.join(os.getcwd(), "data")
if not os.path.exists(DATA_DIR):
os.makedirs(DATA_DIR)
def load_existing_data(file_path):
"""Load existing data from JSON file to avoid duplicates."""
if os.path.exists(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
return json.load(f)
return []
def save_to_json(data, file_path):
"""Save scraped data to a JSON file, ensuring no duplicates."""
existing_data = load_existing_data(file_path)
existing_links = {article['link'] for article in existing_data}
new_data = [article for article in data if article['link'] not in existing_links]
combined_data = existing_data + new_data
with open(file_path, 'w', encoding='utf-8') as f:
json.dump(combined_data, f, ensure_ascii=False, indent=4)
print(f"Oil news data saved to {file_path}")
def extract_keywords(text):
"""Simple function to extract keywords from text."""
keywords = re.findall(r'\b\w+\b', text.lower())
    return list(set(keywords))[:10]  # Return up to 10 unique keywords (set order is arbitrary)
def scrape_oil_news():
print("Scraping oil market news using Selenium...")
# Set up Selenium options
options = Options()
options.headless = True
driver = webdriver.Firefox(options=options)
driver.get(OIL_NEWS_URL)
# Wait until 'categoryArticle' elements load
try:
WebDriverWait(driver, 20).until(
EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
)
except Exception as e:
print("Error: Content did not load properly.")
driver.quit()
        return []
soup = BeautifulSoup(driver.page_source, "html.parser")
# Parse the articles
articles = soup.find_all('div', class_='categoryArticle')
news_data = []
page_number = 1
max_pages = 10 # Limit to 10 pages
print(f"Found {len(articles)} articles.")
while page_number <= max_pages:
# Load the page with pagination
driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
try:
WebDriverWait(driver, 20).until(
EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
)
except Exception as e:
print(f"Error: Content did not load properly on page {page_number}.")
break
        soup = BeautifulSoup(driver.page_source, "html.parser")
        articles = soup.find_all('div', class_='categoryArticle')
        if not articles:
            print(f"No articles found on page {page_number}. Ending pagination.")
            break
for article in articles:
headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
link = article.find('a', href=True)['href'] if article.find('a', href=True) else None
date = article.find('p', class_='categoryArticle__meta').get_text(strip=True) if article.find('p', class_='categoryArticle__meta') else None
excerpt = article.find('p', class_='categoryArticle__excerpt').get_text(strip=True) if article.find('p', class_='categoryArticle__excerpt') else None
            author = date.split('|')[-1].strip() if date and '|' in date else "Unknown Author"
            timestamp = date.split('|')[0].strip() if date and '|' in date else date
if headline and link and date:
news_data.append({
'headline': headline,
'link': link,
'date': timestamp,
'author': author,
'excerpt': excerpt,
'keywords': extract_keywords(headline + " " + excerpt if excerpt else headline),
'sentiment_analysis': None # Placeholder for future sentiment analysis
})
        page_number += 1
        time.sleep(2)

    driver.quit()
    return news_data
def run_scraper():
    file_path = os.path.join(DATA_DIR, 'oil_news.json')
    news_data = scrape_oil_news()
    save_to_json(news_data, file_path)
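A quick sketch of the de-duplication behavior that save_to_json provides: running the same logic twice with an overlapping 'link' does not store the article again. The file path and sample records below are made up for illustration.

import json
import os

# Hypothetical path and records, purely for illustration.
path = os.path.join("data", "oil_news_demo.json")
os.makedirs("data", exist_ok=True)

# Pretend a previous run already stored one article.
with open(path, "w", encoding="utf-8") as f:
    json.dump([{"headline": "Old story", "link": "https://example.com/a"}], f)

# A new scrape returns one duplicate link and one new one.
scraped = [
    {"headline": "Old story", "link": "https://example.com/a"},
    {"headline": "New story", "link": "https://example.com/b"},
]

# Same rule as save_to_json: only links not already on disk are appended.
with open(path, "r", encoding="utf-8") as f:
    existing = json.load(f)
existing_links = {article["link"] for article in existing}
combined = existing + [a for a in scraped if a["link"] not in existing_links]

print(len(combined))  # 2 -- the duplicate link is not stored twice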
