added new mode
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,202 @@
import json
import re
import os
import time
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from tqdm import tqdm  # Progress bar

OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
SCRAPER_DIR = os.path.dirname(os.path.dirname(__file__))  # One level up
DATA_DIR = os.path.join(SCRAPER_DIR, "data")
KEYWORD_FILE_PATH = os.path.join(SCRAPER_DIR, "assets", "oil_key_words.txt")

if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

def load_existing_data(file_path):
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    return []

def save_to_json(data, file_path):
    existing_data = load_existing_data(file_path)
    existing_links = {article['link'] for article in existing_data if 'link' in article}

    new_data = []
    for article in data:
        if 'link' not in article or article['link'] in existing_links:
            print(f"Skipping duplicate or missing link article: {article.get('headline', 'Unknown Headline')}")
            continue
        new_data.append(article)

    combined_data = existing_data + new_data

    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(combined_data, f, ensure_ascii=False, indent=4)
    print(f"Data saved to {file_path}")

def load_keyword_importance(file_path):
    keyword_importance = {}
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) == 2:
                    keyword, importance = parts
                    keyword_importance[keyword.lower()] = int(importance)
    else:
        print(f"Keyword file not found at {file_path}")
    return keyword_importance

keyword_importance = load_keyword_importance(KEYWORD_FILE_PATH)

def extract_keywords(text, keyword_importance):
    words = re.findall(r'\b\w+\b', text.lower())
    keywords = {word: keyword_importance[word] for word in words if word in keyword_importance}
    return sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:10]

def filter_content(content):
    """Remove advertisements, irrelevant phrases, headers, and disclaimers from content."""
    patterns = [
        r'ADVERTISEMENT',
        r'Click Here for \d+\+ Global Oil Prices',
        r'Find us on:',
        r'Back to homepage',
        r'Join the discussion',
        r'More Top Reads From Oilprice.com',
        r'©OilPrice\.com.*?educational purposes',
        r'A Media Solutions.*?Oilprice.com',
        r'\"It\'s most important 8 minute read of my week…\"',
        r'^[\w\s]*?is a [\w\s]*? for Oilprice\.com.*?More Info',
        r'^.*?DNOW is a supplier.*?,',
    ]

    for pattern in patterns:
        content = re.sub(pattern, '', content, flags=re.IGNORECASE)
    content = re.sub(r'\s+', ' ', content).strip()
    return content

def extract_author_info(driver, article_soup):
    """Extract detailed author information from the 'read more' link if available."""
    author = "Unknown Author"
    author_bio = ""

    author_tag = article_soup.find('a', text=re.compile(r'More Info|Read More', re.IGNORECASE))
    if author_tag:
        try:
            driver.get(author_tag['href'])
            # Increased wait time to handle slow-loading pages
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CLASS_NAME, "authorBio"))
            )
            bio_soup = BeautifulSoup(driver.page_source, "html.parser")

            # Primary search for author name and bio
            author_name_tag = bio_soup.find('h1')
            author_bio_tag = bio_soup.find('p')

            # Fallback if primary elements are not found
            if not author_name_tag or not author_bio_tag:
                author_name_tag = bio_soup.find('span', class_='author-name')  # Hypothetical class for author name
                author_bio_tag = bio_soup.find('div', class_='bio-content')  # Hypothetical class for bio content

            author = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"
            author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"

        except Exception as e:
            print(f"Author bio page failed to load or extract. Error: {e}")

    return author, author_bio

def scrape_oil_news():
    print("Scraping oil news articles for sentiment analysis...")

    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options)

    news_data = []
    page_number = 1
    max_pages = 1
    total_articles = 0

    # First pass: count articles across the listing pages so tqdm knows the total.
    while page_number <= max_pages:
        driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
            )
        except Exception:
            break
        soup = BeautifulSoup(driver.page_source, "html.parser")
        total_articles += len(soup.find_all('div', class_='categoryArticle'))
        page_number += 1

    page_number = 1
    with tqdm(total=total_articles, desc="Scraping articles", unit="article") as pbar:
        while page_number <= max_pages:
            print(f"\nProcessing page {page_number}...")
            driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
            soup = BeautifulSoup(driver.page_source, "html.parser")
            articles = soup.find_all('div', class_='categoryArticle')
            if not articles:
                break

            for article in articles:
                headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
                link_tag = article.find('a', href=True)
                link = link_tag['href'] if link_tag else None
                date_meta = article.find('p', class_='categoryArticle__meta')
                date = date_meta.get_text(strip=True).split('|')[0].strip() if date_meta else None

                content = ""
                # Defaults so the author fields stay defined even if the article page fails to load.
                author, author_bio = "Unknown Author", ""
                if link:
                    print(f"Fetching article: {link}")
                    driver.get(link)
                    try:
                        WebDriverWait(driver, 10).until(
                            EC.presence_of_element_located((By.CLASS_NAME, "singleArticle"))
                        )
                        article_soup = BeautifulSoup(driver.page_source, "html.parser")
                        raw_content = " ".join([p.get_text(strip=True) for p in article_soup.find_all('p')])
                        content = filter_content(raw_content)
                        author, author_bio = extract_author_info(driver, article_soup)
                    except Exception:
                        print(f"Error: Content did not load for article {headline}.")

                extracted_keywords = extract_keywords(f"{headline} {content}", keyword_importance)

                if headline and link and date:
                    news_data.append({
                        'headline': headline,
                        'link': link,
                        'content': content,
                        'date': date,
                        'author': author,
                        'author_bio': author_bio,
                        'keywords': extracted_keywords,
                    })

                pbar.set_postfix_str(f"Processing article: {(headline or '')[:40]}...")
                pbar.update(1)

            page_number += 1
            time.sleep(2)

    driver.quit()
    return news_data

def run_preprocessor():
    file_path = os.path.join(DATA_DIR, 'preprocessed_oil_news.json')
    news_data = scrape_oil_news()
    save_to_json(news_data, file_path)

if __name__ == "__main__":
    run_preprocessor()

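For reference, load_keyword_importance expects assets/oil_key_words.txt to hold one keyword and one integer weight per line, separated by whitespace; lines that do not split into exactly two fields are ignored. A hypothetical excerpt (these words and weights are illustrative, not taken from the real asset file):

    opec 10
    sanctions 8
    pipeline 6
    barrel 4

With those weights, extract_keywords("OPEC weighs pipeline sanctions", keyword_importance) would return [('opec', 10), ('sanctions', 8), ('pipeline', 6)]: the matching words sorted by importance, capped at ten.
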
231  Data-Collection/WebScraper/scrapers/oil_news_preprocessor.py  Normal file
@@ -0,0 +1,231 @@
import json
import re
import os
import time
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from tqdm import tqdm  # Progress bar

OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
SCRAPER_DIR = os.path.dirname(os.path.dirname(__file__))  # One level up
DATA_DIR = os.path.join(SCRAPER_DIR, "data")
KEYWORD_FILE_PATH = os.path.join(SCRAPER_DIR, "assets", "oil_key_words.txt")

if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

def load_existing_data(file_path):
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    return []

def save_to_json(data, file_path):
    existing_data = load_existing_data(file_path)
    existing_links = {article['link'] for article in existing_data if 'link' in article}

    new_data = []
    for article in data:
        if 'link' not in article or article['link'] in existing_links:
            print(f"Skipping duplicate or missing link article: {article.get('headline', 'Unknown Headline')}")
            continue
        new_data.append(article)

    combined_data = existing_data + new_data

    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(combined_data, f, ensure_ascii=False, indent=4)
    print(f"Data saved to {file_path}")

def load_keyword_importance(file_path):
    keyword_importance = {}
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) == 2:
                    keyword, importance = parts
                    keyword_importance[keyword.lower()] = int(importance)
    else:
        print(f"Keyword file not found at {file_path}")
    return keyword_importance

keyword_importance = load_keyword_importance(KEYWORD_FILE_PATH)

def extract_keywords(text, keyword_importance):
    words = re.findall(r'\b\w+\b', text.lower())
    keywords = {word: keyword_importance[word] for word in words if word in keyword_importance}
    return sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:10]

def filter_content(content):
    """Remove advertisements, irrelevant phrases, headers, and disclaimers from content."""
    patterns = [
        r'ADVERTISEMENT',
        r'Click Here for \d+\+ Global Oil Prices',
        r'Find us on:',
        r'Back to homepage',
        r'Join the discussion',
        r'More Top Reads From Oilprice.com',
        r'©OilPrice\.com.*?educational purposes',
        r'A Media Solutions.*?Oilprice.com',
        r'\"It\'s most important 8 minute read of my week…\"',
        r'^[\w\s]*?is a [\w\s]*? for Oilprice\.com.*?More Info',
        r'^.*?DNOW is a supplier.*?,',
    ]

    for pattern in patterns:
        content = re.sub(pattern, '', content, flags=re.IGNORECASE)
    content = re.sub(r'\s+', ' ', content).strip()
    return content

def extract_author_info(driver, article_soup, headline_pages=1):
    """Extract detailed author information from the 'read more' link if available."""
    author = "Unknown Author"
    author_bio = ""
    contributor_since = ""
    other_articles = []

    author_tag = article_soup.find('a', text=re.compile(r'More Info|Read More', re.IGNORECASE))
    if author_tag:
        retries = 3  # Set retry limit
        for attempt in range(retries):
            try:
                driver.get(author_tag['href'])
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "authorBio"))
                )
                bio_soup = BeautifulSoup(driver.page_source, "html.parser")

                # Extract author's name
                author_name_tag = bio_soup.find('h1')
                author = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"

                # Extract author's bio description
                author_bio_tag = bio_soup.find('p')
                author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"

                # Extract contributor since date
                contributor_since_tag = bio_soup.find(text=re.compile(r"Contributor since", re.IGNORECASE))
                if contributor_since_tag:
                    contributor_since = contributor_since_tag.parent.get_text(strip=True).replace("Contributor since: ", "")

                # Extract headlines of latest articles by the author, limited by `headline_pages`
                for page in range(1, headline_pages + 1):
                    driver.get(f"{author_tag['href']}Page-{page}.html")
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
                    )
                    page_soup = BeautifulSoup(driver.page_source, "html.parser")
                    article_tags = page_soup.find_all('h2', class_='categoryArticle__title')

                    for article in article_tags:
                        other_articles.append(article.get_text(strip=True))

                break  # Break loop if successful

            except Exception as e:
                print(f"Attempt {attempt + 1} failed for author bio page. Retrying...")
                time.sleep(2)  # Wait before retrying
                if attempt == retries - 1:
                    print(f"Author bio page failed to load or extract after {retries} attempts. Error: {e}")

    return {
        "name": author,
        "bio": author_bio,
        "contributor_since": contributor_since,
        "other_articles": other_articles
    }

def scrape_oil_news():
    print("Scraping oil news articles for sentiment analysis...")

    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options)

    news_data = []
    page_number = 1
    max_pages = 1
    total_articles = 0

    # First pass: count articles across the listing pages so tqdm knows the total.
    while page_number <= max_pages:
        driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
            )
        except Exception:
            break
        soup = BeautifulSoup(driver.page_source, "html.parser")
        total_articles += len(soup.find_all('div', class_='categoryArticle'))
        page_number += 1

    page_number = 1
    with tqdm(total=total_articles, desc="Scraping articles", unit="article") as pbar:
        while page_number <= max_pages:
            print(f"\nProcessing page {page_number}...")
            driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
            soup = BeautifulSoup(driver.page_source, "html.parser")
            articles = soup.find_all('div', class_='categoryArticle')
            if not articles:
                break

            for article in articles:
                headline = article.find('h2', class_='categoryArticle__title').get_text(strip=True) if article.find('h2', class_='categoryArticle__title') else None
                link_tag = article.find('a', href=True)
                link = link_tag['href'] if link_tag else None
                date_meta = article.find('p', class_='categoryArticle__meta')
                date = date_meta.get_text(strip=True).split('|')[0].strip() if date_meta else None

                content = ""
                # Default author details so the fields stay defined if the article page fails to load.
                author_info = {"name": "Unknown Author", "bio": "", "contributor_since": "", "other_articles": []}
                if link:
                    print(f"Fetching article: {link}")
                    driver.get(link)
                    try:
                        WebDriverWait(driver, 10).until(
                            EC.presence_of_element_located((By.CLASS_NAME, "singleArticle"))
                        )
                        article_soup = BeautifulSoup(driver.page_source, "html.parser")
                        raw_content = " ".join([p.get_text(strip=True) for p in article_soup.find_all('p')])
                        content = filter_content(raw_content)
                        # extract_author_info now returns a dict with name, bio, contributor_since and other_articles.
                        author_info = extract_author_info(driver, article_soup, headline_pages=1)
                    except Exception:
                        print(f"Error: Content did not load for article {headline}.")

                extracted_keywords = extract_keywords(f"{headline} {content}", keyword_importance)

                if headline and link and date:
                    news_data.append({
                        'headline': headline,
                        'link': link,
                        'content': content,
                        'date': date,
                        'author': author_info['name'],
                        'author_bio': author_info['bio'],
                        'contributor_since': author_info['contributor_since'],
                        'other_articles': author_info['other_articles'],
                        'keywords': extracted_keywords,
                    })

                pbar.set_postfix_str(f"Processing article: {(headline or '')[:40]}...")
                pbar.update(1)

            page_number += 1
            time.sleep(2)

    driver.quit()
    return news_data

def run_preprocessor():
    file_path = os.path.join(DATA_DIR, 'preprocessed_oil_news.json')
    news_data = scrape_oil_news()
    save_to_json(news_data, file_path)

if __name__ == "__main__":
    run_preprocessor()

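Each record this version appends to data/preprocessed_oil_news.json therefore carries the enriched author fields. A sketch of one entry's shape (field names come from the dict built in scrape_oil_news; the angle-bracket values and the example importances are placeholders, and json.dump serialises the keywords tuples as [keyword, importance] pairs):

    {
        "headline": "<article headline>",
        "link": "<article URL>",
        "content": "<filtered article text>",
        "date": "<date taken from categoryArticle__meta>",
        "author": "<author name, or 'Unknown Author'>",
        "author_bio": "<bio text, or 'No bio available'>",
        "contributor_since": "<'Contributor since' date, or ''>",
        "other_articles": ["<headline of another piece by the same author>"],
        "keywords": [["<keyword>", 9], ["<keyword>", 7]]
    }
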
@@ -11,6 +11,8 @@ import re
 
 OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
 DATA_DIR = os.path.join(os.getcwd(), "data")
+KEYWORD_FILE_PATH = os.path.join(os.getcwd(), "assets", "oil_key_words.txt")
 
 if not os.path.exists(DATA_DIR):
     os.makedirs(DATA_DIR)
 
@@ -39,15 +41,37 @@ def save_to_json(data, file_path):
         json.dump(combined_data, f, ensure_ascii=False, indent=4)
     print(f"Oil news data saved to {file_path}")
 
-def extract_keywords(text):
-    """Improved placeholder function to extract keywords from text."""
+def load_keyword_importance(file_path):
+    """Load keyword importance values from the oil_key_words.txt file."""
+    keyword_importance = {}
+    if os.path.exists(file_path):
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                parts = line.strip().split()
+                if len(parts) == 2:
+                    keyword, importance = parts
+                    keyword_importance[keyword.lower()] = int(importance)
+    else:
+        print(f"Keyword file not found at {file_path}")
+    return keyword_importance
+
+keyword_importance = load_keyword_importance(KEYWORD_FILE_PATH)
+
+def extract_keywords(text, keyword_importance):
+    """Extract important keywords from text based on an external keyword list."""
     words = re.findall(r'\b\w+\b', text.lower())
-    keywords = [word for word in words if len(word) > 3]  # Example filter: words longer than 3 chars
-    return list(set(keywords))[:10]  # Return up to 10 unique keywords
+    keywords = {}
+
+    for word in words:
+        if len(word) > 3 and word in keyword_importance:
+            keywords[word] = keyword_importance[word]  # Store keyword with its importance
+
+    # Return up to 10 unique keywords with their importance
+    return sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:10]
 
 def analyze_sentiment(text):
-    """Placeholder function for sentiment analysis."""
-    # Basic placeholder logic (to be replaced with actual sentiment analysis)
+    """Basic sentiment analysis placeholder with minimal processing."""
+    # Only check for specific keywords; avoid complex logic to save time
     if "profit" in text or "rise" in text:
         return "Positive"
     elif "loss" in text or "decline" in text:
@@ -67,6 +91,7 @@ def scrape_oil_news():
     max_pages = 10  # Limit to 10 pages
 
     while page_number <= max_pages:
+        print(f"Processing page {page_number}...")
         driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
 
         try:
@@ -91,7 +116,8 @@ def scrape_oil_news():
             excerpt = article.find('p', class_='categoryArticle__excerpt').get_text(strip=True) if article.find('p', class_='categoryArticle__excerpt') else None
             author = date.split('|')[-1].strip() if '|' in date else "Unknown Author"
             timestamp = date.split('|')[0].strip() if '|' in date else date
 
+            extracted_keywords = extract_keywords(headline + " " + excerpt if excerpt else headline, keyword_importance)
 
             if headline and link and date:
                 news_data.append({
                     'headline': headline,
@@ -99,8 +125,9 @@ def scrape_oil_news():
                     'date': timestamp,
                     'author': author,
                     'excerpt': excerpt,
-                    'keywords': extract_keywords(headline + " " + excerpt if excerpt else headline),
-                    'sentiment_analysis': analyze_sentiment(headline + " " + excerpt if excerpt else headline)
+                    'keywords': extracted_keywords,
+                    'sentiment_analysis': None
+                    #'sentiment_analysis': analyze_sentiment(headline + " " + excerpt if excerpt else headline)
                 })
 
             page_number += 1
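As a quick smoke test of the new mode, something like the following could be run from the Data-Collection/WebScraper directory. This is a minimal sketch: it assumes Firefox and geckodriver are installed, and that scrapers/ is importable from that directory (the import path and the driver script itself are hypothetical, not part of the commit):

    import json
    import os
    import sys

    # Make scrapers/ importable when this sketch lives in Data-Collection/WebScraper; adjust to the real layout.
    sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
    from scrapers.oil_news_preprocessor import run_preprocessor, DATA_DIR

    run_preprocessor()
    with open(os.path.join(DATA_DIR, "preprocessed_oil_news.json"), encoding="utf-8") as f:
        articles = json.load(f)
    print(f"Collected {len(articles)} articles")

Note that the new preprocessor resolves data/ and assets/ from its own location via __file__, while the older scraper touched by the last hunk still builds its paths from os.getcwd(), so that one should be launched from the directory that contains assets/ and data/.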