added author test, about to integrate into preprocessor.py

2024-10-31 15:42:16 -04:00
parent 69c4ada27b
commit 064486a747
4 changed files with 619 additions and 28 deletions
--- a/Data-Collection/WebScraper/scrapers/backups/oil_news_preprocessor.py.bak
+++ b/Data-Collection/WebScraper/scrapers/backups/oil_news_preprocessor.py.bak
@@ -82,37 +82,63 @@ def filter_content(content):
    content = re.sub(r'\s+', ' ', content).strip()
    return content

-def extract_author_info(driver, article_soup):
+def extract_author_info(driver, article_soup, headline_pages=1):
-    """Extract detailed author information from the 'read more' link if available."""
+    """Collect author details by following the article's 'More Info' / 'Read More' link.
+
+    Args:
+        driver: Browser driver; its ``get`` method and ``page_source``
+            attribute are used to load and read the author pages.
+        article_soup: Parsed article whose ``find`` method is searched for
+            the author link.
+        headline_pages: Number of ``Page-<n>.html`` listing pages to read
+            for the author's other headlines, starting at page 1.
+
+    Returns:
+        A dict with the keys ``name``, ``bio``, ``contributor_since`` and
+        ``other_articles`` (a list of headline strings). When no author link
+        is found, or every attempt fails, the fields keep their defaults or
+        whatever the attempts managed to extract. Errors raised while
+        loading or parsing the author pages are caught and printed.
+    """
    author = "Unknown Author"
    author_bio = ""
-    
+    contributor_since = ""
+    other_articles = []
+
    author_tag = article_soup.find('a', text=re.compile(r'More Info|Read More', re.IGNORECASE))
    if author_tag:
-        try:
-            driver.get(author_tag['href'])
-            # Increased wait time to handle slow-loading pages
-            WebDriverWait(driver, 15).until(
-                EC.presence_of_element_located((By.CLASS_NAME, "authorBio"))
-            )
-            bio_soup = BeautifulSoup(driver.page_source, "html.parser")
-            
-            # Primary search for author name and bio
-            author_name_tag = bio_soup.find('h1')
-            author_bio_tag = bio_soup.find('p')
-            
-            # Fallback if primary elements are not found
-            if not author_name_tag or not author_bio_tag:
-                author_name_tag = bio_soup.find('span', class_='author-name')  # Hypothetical class for author name
-                author_bio_tag = bio_soup.find('div', class_='bio-content')   # Hypothetical class for bio content
-            
-            author = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"
-            author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"
-        
-        except Exception as e:
-            print(f"Author bio page failed to load or extract. Error: {e}")
-    
-    return author, author_bio
+        retries = 3  # Set retry limit
+        for attempt in range(retries):
+            # Start every attempt with an empty list so headlines gathered by
+            # a failed attempt are not duplicated by the next one.
+            other_articles = []
+            try:
+                author_url = author_tag['href']
+                driver.get(author_url)
+                WebDriverWait(driver, 15).until(
+                    EC.presence_of_element_located((By.CLASS_NAME, "authorBio"))
+                )
+                bio_soup = BeautifulSoup(driver.page_source, "html.parser")
+
+                # Extract author's name
+                author_name_tag = bio_soup.find('h1')
+                author = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"
+
+                # Extract author's bio description
+                author_bio_tag = bio_soup.find('p')
+                author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"
+
+                # Extract contributor since date
+                contributor_since_tag = bio_soup.find(text=re.compile(r"Contributor since", re.IGNORECASE))
+                if contributor_since_tag:
+                    contributor_since = contributor_since_tag.parent.get_text(strip=True).replace("Contributor since: ", "")
+
+                # Extract headlines of latest articles by the author, limited by `headline_pages`
+                for page in range(1, headline_pages + 1):
+                    # Put exactly one "/" between the profile URL and the
+                    # page name, whether or not the href ends with a slash.
+                    driver.get(f"{author_url.rstrip('/')}/Page-{page}.html")
+                    WebDriverWait(driver, 10).until(
+                        EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
+                    )
+                    page_soup = BeautifulSoup(driver.page_source, "html.parser")
+                    article_tags = page_soup.find_all('h2', class_='categoryArticle__title')
+                    other_articles.extend(
+                        article.get_text(strip=True) for article in article_tags
+                    )
+
+                break  # Break loop if successful
+
+            except Exception as e:
+                if attempt == retries - 1:
+                    print(f"Author bio page failed to load or extract after {retries} attempts. Error: {e}")
+                else:
+                    print(f"Attempt {attempt + 1} failed for author bio page. Retrying...")
+                    time.sleep(2)  # Wait before retrying
+
+    return {
+        "name": author,
+        "bio": author_bio,
+        "contributor_since": contributor_since,
+        "other_articles": other_articles
+    }

 def scrape_oil_news():
    print("Scraping oil news articles for sentiment analysis...")
@@ -173,13 +199,16 @@ def scrape_oil_news():
                extracted_keywords = extract_keywords(f"{headline} {content}", keyword_importance)

                if headline and link and date:
+                    author_info = extract_author_info(driver, article_soup, headline_pages=1)
                    news_data.append({
                        'headline': headline,
                        'link': link,
                        'content': content,
                        'date': date,
-                        'author': author,
-                        'author_bio': author_bio,
+                        'author': author_info['name'],
+                        'author_bio': author_info['bio'],
+                        'contributor_since': author_info['contributor_since'],
+                        'other_articles': author_info['other_articles'],
                        'keywords': extracted_keywords,
                    })

--- a/Data-Collection/WebScraper/scrapers/tests/author_info.json
+++ b/Data-Collection/WebScraper/scrapers/tests/author_info.json
@@ -0,0 +1,347 @@
+{
+    "name": "Charles Kennedy",
+    "bio": "Charles is a writer for Oilprice.com",
+    "contributor_since": "29 Sep 2011",
+    "other_articles": [
+        {
+            "heading": "Record Shale Production Helps ConocoPhillips Beat Profit Estimates",
+            "excerpt": "ConocoPhillips (NYSE: COP) is raising its ordinary dividend and share buyback program as its third-quarter earnings beat market expectations on the back of higher total…",
+            "keywords": [
+                "share",
+                "market",
+                "higher",
+                "back",
+                "total",
+                "expectations",
+                "third",
+                "beat",
+                "raising",
+                "conocophillips"
+            ],
+            "published_date": "31 October 2024"
+        },
+        {
+            "heading": "Rosneft to Resume Output at Idled Black Sea Refinery in November",
+            "excerpt": "Rosneft plans to resume crude processing at its Tuapse oil refinery on Russia’s Black Sea coast in November, after idling it for a month because…",
+            "keywords": [
+                "processing",
+                "idling",
+                "russia",
+                "plans",
+                "rosneft",
+                "refinery",
+                "tuapse",
+                "crude",
+                "november",
+                "black"
+            ],
+            "published_date": "31 October 2024"
+        },
+        {
+            "heading": "Canadian Natural Resources Q3 Profit Slips as Oil and Gas Prices Fall",
+            "excerpt": "Canada’s largest oil and gas producer, Canadian Natural Resources (NYSE: CNQ), reported lower adjusted net earnings from operations for the third quarter compared to a…",
+            "keywords": [
+                "canada",
+                "operations",
+                "producer",
+                "resources",
+                "reported",
+                "canadian",
+                "largest",
+                "third",
+                "natural",
+                "nyse"
+            ],
+            "published_date": "31 October 2024"
+        },
+        {
+            "heading": "Exelon Reports 80% Surge in Data Center Power Supply Deals",
+            "excerpt": "Exelon has seen an 80% increase in power supply deals coming from data enter operators in the latest sign that the IT industry is driving…",
+            "keywords": [
+                "industry",
+                "data",
+                "driving",
+                "seen",
+                "power",
+                "increase",
+                "exelon",
+                "deals",
+                "sign",
+                "that"
+            ],
+            "published_date": "31 October 2024"
+        },
+        {
+            "heading": "Russia’s Gazprom Boosts 2024 Investments to $16.9 Billion",
+            "excerpt": "Gazprom is raising its investment plan for 2024 by 4% to $16.9 billion (1.642 trillion Russian rubles), thanks to rising exports and domestic supply, the…",
+            "keywords": [
+                "investment",
+                "russian",
+                "rubles",
+                "plan",
+                "exports",
+                "billion",
+                "raising",
+                "thanks",
+                "trillion",
+                "supply"
+            ],
+            "published_date": "30 October 2024"
+        },
+        {
+            "heading": "Investment Giants Form $50-Billion AI and Power Partnership",
+            "excerpt": "Global investment firm KKR and private-equity giant Energy Capital Partners on Wednesday announced a $50 billion strategic partnership to invest in data centers and power…",
+            "keywords": [
+                "centers",
+                "strategic",
+                "investment",
+                "giant",
+                "energy",
+                "capital",
+                "private",
+                "wednesday",
+                "billion",
+                "data"
+            ],
+            "published_date": "30 October 2024"
+        },
+        {
+            "heading": "Vietnamese EV Maker Gets $1 Billion in Funding Led by UAE",
+            "excerpt": "Vietnam’s electric vehicle manufacturer VinFast Auto is expected to receive at least $1 billion in overseas funding led by Emirates Driving Company (EDC), Abu Dhabi’s…",
+            "keywords": [
+                "overseas",
+                "manufacturer",
+                "vietnam",
+                "expected",
+                "billion",
+                "driving",
+                "emirates",
+                "funding",
+                "receive",
+                "least"
+            ],
+            "published_date": "30 October 2024"
+        },
+        {
+            "heading": "Chinese Oil Major to Explore Iraqi Field",
+            "excerpt": "China’s CNOOC has inked a deal for exploration at an oil field in central Iraq, the company said today.\nThe deposit, Block 7, will be…",
+            "keywords": [
+                "deposit",
+                "cnooc",
+                "iraq",
+                "field",
+                "central",
+                "deal",
+                "today",
+                "said",
+                "china",
+                "inked"
+            ],
+            "published_date": "30 October 2024"
+        },
+        {
+            "heading": "TotalEnergies to Produce More Gas Condensate Offshore Denmark",
+            "excerpt": "U.S. refining and chemicals giant Phillips 66 (NYSE: PSX) booked higher-than-expected earnings for the third quarter even if earnings plunged from a year earlier, as…",
+            "keywords": [
+                "phillips",
+                "refining",
+                "giant",
+                "than",
+                "expected",
+                "higher",
+                "year",
+                "plunged",
+                "third",
+                "even"
+            ],
+            "published_date": "29 October 2024"
+        },
+        {
+            "heading": "Phillips 66 Beats Analyst Estimates Despite Earnings Dip in Q3",
+            "excerpt": "U.S. refining and chemicals giant Phillips 66 (NYSE: PSX) booked higher-than-expected earnings for the third quarter even if earnings plunged from a year earlier, as…",
+            "keywords": [
+                "phillips",
+                "refining",
+                "giant",
+                "than",
+                "expected",
+                "higher",
+                "year",
+                "plunged",
+                "third",
+                "even"
+            ],
+            "published_date": "29 October 2024"
+        },
+        {
+            "heading": "UK Offshore Oil Platform Halted Due to Gas Compressor Issue",
+            "excerpt": "Production via the Triton Floating Production Storage & Offloading (FPSO) vessel in the UK North Sea has been halted due to a problem with the…",
+            "keywords": [
+                "fpso",
+                "been",
+                "with",
+                "problem",
+                "halted",
+                "storage",
+                "triton",
+                "vessel",
+                "offloading",
+                "north"
+            ],
+            "published_date": "29 October 2024"
+        },
+        {
+            "heading": "UAE’s Renewable Energy Giant Pushes Back Green Hydrogen Targets",
+            "excerpt": "Masdar, the clean energy giant of the United Arab Emirates (UAE), has pushed back its target to reach 1 million tons per year of green…",
+            "keywords": [
+                "united",
+                "energy",
+                "giant",
+                "emirates",
+                "back",
+                "year",
+                "million",
+                "arab",
+                "pushed",
+                "target"
+            ],
+            "published_date": "28 October 2024"
+        },
+        {
+            "heading": "Profit at India’s Top Refiner Slumps by 99% Due to Weak Margins",
+            "excerpt": "IndianOil, the biggest refiner in India, reported on Monday a net profit tumbling by 98.6% in the quarter to September from a year ago amid…",
+            "keywords": [
+                "refiner",
+                "monday",
+                "september",
+                "biggest",
+                "reported",
+                "indianoil",
+                "india",
+                "year",
+                "tumbling",
+                "profit"
+            ],
+            "published_date": "28 October 2024"
+        },
+        {
+            "heading": "Average U.S. Gasoline Price Set to Drop Below $3 for the First Time Since 2021",
+            "excerpt": "The U.S. national average price of gasoline is set to soon fall below $3 per gallon for the first time since 2021, amid lower seasonal…",
+            "keywords": [
+                "gasoline",
+                "national",
+                "below",
+                "gallon",
+                "soon",
+                "first",
+                "lower",
+                "average",
+                "seasonal",
+                "price"
+            ],
+            "published_date": "28 October 2024"
+        },
+        {
+            "heading": "FERC Grants Exxon and Qatar Three-Year Extension to Build Golden Pass LNG",
+            "excerpt": "The U.S. Federal Energy Regulatory Commission has granted a three-year extension to ExxonMobil and QatarEnergy to build their $10-billion Golden Pass LNG export plant in…",
+            "keywords": [
+                "federal",
+                "export",
+                "three",
+                "energy",
+                "golden",
+                "billion",
+                "year",
+                "their",
+                "qatarenergy",
+                "regulatory"
+            ],
+            "published_date": "25 October 2024"
+        },
+        {
+            "heading": "Cepsa: Windfall Tax Would Delay Its $3.3-Billion Hydrogen Plan",
+            "excerpt": "Cepsa, Spain’s second-largest oil company, will delay its $3.25 billion (3 billion euros) investment into domestic green hydrogen projects if Spain makes the windfall tax…",
+            "keywords": [
+                "investment",
+                "second",
+                "projects",
+                "billion",
+                "euros",
+                "largest",
+                "into",
+                "delay",
+                "will",
+                "cepsa"
+            ],
+            "published_date": "25 October 2024"
+        },
+        {
+            "heading": "South Africa Seeks Loan Guarantees for Energy Transition Funding",
+            "excerpt": "South Africa is currently negotiating loan guarantees with its international partners in its $9.3-billion Just Energy Transition Partnership (JETP) program for energy investment.\nThe International…",
+            "keywords": [
+                "jetp",
+                "negotiating",
+                "energy",
+                "transition",
+                "currently",
+                "investment",
+                "billion",
+                "south",
+                "africa",
+                "guarantees"
+            ],
+            "published_date": "25 October 2024"
+        },
+        {
+            "heading": "Saudi Oil Export Revenues Hit Three-Year Low as Prices Decline",
+            "excerpt": "Lower crude oil prices dragged Saudi Arabia’s oil export revenues to the lowest level in more than three years in August, amid underwhelming oil demand…",
+            "keywords": [
+                "years",
+                "three",
+                "august",
+                "than",
+                "more",
+                "dragged",
+                "revenues",
+                "saudi",
+                "crude",
+                "prices"
+            ],
+            "published_date": "24 October 2024"
+        },
+        {
+            "heading": "Tesla Stock Soars After Q3 Earnings Beat",
+            "excerpt": "Tesla (NASDAQ: TSLA) saw its shares jump by 20% after hours on Wednesday and another 14% in pre-market trade on Thursday after reporting earnings for…",
+            "keywords": [
+                "thursday",
+                "after",
+                "trade",
+                "market",
+                "tesla",
+                "wednesday",
+                "another",
+                "nasdaq",
+                "hours",
+                "reporting"
+            ],
+            "published_date": "24 October 2024"
+        },
+        {
+            "heading": "Oil Refining Giant Valero Tops Estimates Despite Q3 Profit Plunge",
+            "excerpt": "One of the biggest U.S. refiners, Valero Energy (NYSE: VLO), beat Wall Street estimates even as it reported a widely expected plunge in its third-quarter…",
+            "keywords": [
+                "street",
+                "energy",
+                "biggest",
+                "wall",
+                "reported",
+                "expected",
+                "plunge",
+                "widely",
+                "third",
+                "valero"
+            ],
+            "published_date": "24 October 2024"
+        }
+    ]
+}
--- a/Data-Collection/WebScraper/scrapers/tests/author_scraper_test.py
+++ b/Data-Collection/WebScraper/scrapers/tests/author_scraper_test.py
@@ -0,0 +1,109 @@
+import json
+import re
+import time
+from selenium import webdriver
+from selenium.webdriver.firefox.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from bs4 import BeautifulSoup
+
+# Contributor profile page to scrape; change this to target a different author.
+AUTHOR_URL = "https://oilprice.com/contributors/Charles-Kennedy"
+# Path of the JSON file the scraped author data is written to.
+OUTPUT_FILE = "author_info.json"
+
+def extract_keywords(text):
+    """Return up to 10 distinct keywords from ``text``, in order of first appearance.
+
+    A keyword is any run of at least four word characters, lower-cased.
+
+    Args:
+        text: The string to extract keywords from.
+
+    Returns:
+        A list of at most 10 unique lowercase words; empty when ``text``
+        contains no word of four or more characters.
+    """
+    words = re.findall(r'\b\w{4,}\b', text.lower())
+    # dict.fromkeys de-duplicates while keeping first-occurrence order. A set
+    # has arbitrary order, so slicing it gave different keywords between runs.
+    keywords = list(dict.fromkeys(words))
+    return keywords[:10]
+
+def scrape_author_info(author_url, headline_pages=1):
+    """Scrape an author's profile and recent articles from their contributor page.
+
+    A Firefox session is started for the call and is always quit before
+    the function returns.
+
+    Args:
+        author_url: URL of the author's profile page. Listing pages are
+            requested as ``{author_url}/Page-{n}.html``.
+        headline_pages: Number of listing pages to read, starting at page 1.
+
+    Returns:
+        A dict with the keys ``name``, ``bio``, ``contributor_since`` and
+        ``other_articles``. Each entry of ``other_articles`` is a dict with
+        ``heading``, ``excerpt``, ``keywords`` and ``published_date``. If any
+        scraping step raises, the error is printed and every field is
+        replaced by a placeholder (``bio`` then carries the exception text).
+    """
+    options = Options()
+    # Pass the flag directly: the ``Options.headless`` setter was deprecated
+    # and later removed in Selenium 4, where assigning it has no effect and
+    # a visible browser window is opened instead.
+    options.add_argument("-headless")
+    driver = webdriver.Firefox(options=options)
+
+    author_name = "Unknown"
+    author_bio = ""
+    contributor_since = ""
+    other_articles = []
+
+    try:
+        # Load author page
+        driver.get(author_url)
+        WebDriverWait(driver, 15).until(
+            EC.presence_of_element_located((By.TAG_NAME, "h1"))
+        )
+        page_source = driver.page_source
+        bio_soup = BeautifulSoup(page_source, "html.parser")
+
+        # Extract author name
+        author_name_tag = bio_soup.find('h1')
+        author_name = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"
+
+        # Extract author bio
+        author_bio_tag = bio_soup.find('div', class_='biography')
+        author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"
+
+        # Extract contributor since date
+        contributor_since_tag = bio_soup.find('p', class_='contributor_since')
+        contributor_since = contributor_since_tag.get_text(strip=True).replace("Contributor since: ", "") if contributor_since_tag else "Unknown Date"
+
+        # Extract latest articles by author with heading, excerpt, keywords, and timestamp
+        for page in range(1, headline_pages + 1):
+            driver.get(f"{author_url}/Page-{page}.html")
+            WebDriverWait(driver, 10).until(
+                EC.presence_of_element_located((By.CLASS_NAME, "articles"))
+            )
+            page_soup = BeautifulSoup(driver.page_source, "html.parser")
+            article_tags = page_soup.find_all('li', class_='clear')
+
+            for article in article_tags:
+                heading_tag = article.find('h3')
+                excerpt_tag = article.find('p', class_='articlecontent')
+                timestamp_tag = article.find('div', class_='meta')
+
+                # Skip list items that lack any of the three expected parts.
+                if not (heading_tag and excerpt_tag and timestamp_tag):
+                    continue
+
+                heading = heading_tag.get_text(strip=True)
+                excerpt = excerpt_tag.get_text(strip=True)
+                # Keep only the text before the first "|" and drop the
+                # leading "Published " label.
+                timestamp = timestamp_tag.get_text(strip=True).split("|")[0].replace("Published ", "").strip()
+                keywords = extract_keywords(excerpt)
+
+                other_articles.append({
+                    "heading": heading,
+                    "excerpt": excerpt,
+                    "keywords": keywords,
+                    "published_date": timestamp
+                })
+
+    except Exception as e:
+        # Best effort: report the failure and return placeholder data with
+        # the same shape as a successful result.
+        print(f"Error scraping author info: {e}")
+        author_name = "Error Occurred"
+        author_bio = str(e)
+        contributor_since = "N/A"
+        other_articles = [{"heading": "Error retrieving articles", "excerpt": "", "keywords": [], "published_date": ""}]
+
+    finally:
+        driver.quit()
+
+    return {
+        "name": author_name,
+        "bio": author_bio,
+        "contributor_since": contributor_since,
+        "other_articles": other_articles
+    }
+
+def save_to_json(data, output_file):
+    """Write ``data`` to ``output_file`` as indented UTF-8 JSON and print the path."""
+    with open(output_file, "w", encoding="utf-8") as handle:
+        # ensure_ascii=False keeps non-ASCII characters readable in the file.
+        json.dump(data, handle, indent=4, ensure_ascii=False)
+
+    print(f"Author info saved to {output_file}")
+
+if __name__ == "__main__":
+    # Scrape the configured author's first listing page, then persist the
+    # result as JSON.
+    author_info = scrape_author_info(author_url=AUTHOR_URL, headline_pages=1)
+    save_to_json(data=author_info, output_file=OUTPUT_FILE)
+
--- a/Data-Collection/WebScraper/scrapers/tests/author_scraper_test.py.bak
+++ b/Data-Collection/WebScraper/scrapers/tests/author_scraper_test.py.bak
@@ -0,0 +1,106 @@
+import json
+import re
+import time
+from selenium import webdriver
+from selenium.webdriver.firefox.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from bs4 import BeautifulSoup
+
+# Contributor profile page to scrape; change this to target a different author.
+AUTHOR_URL = "https://oilprice.com/contributors/Charles-Kennedy"
+# Path of the JSON file the scraped author data is written to.
+OUTPUT_FILE = "author_info.json"
+
+def extract_keywords(text):
+    """Return up to 10 distinct keywords from ``text``, in order of first appearance.
+
+    A keyword is any run of at least four word characters, lower-cased.
+
+    Args:
+        text: The string to extract keywords from.
+
+    Returns:
+        A list of at most 10 unique lowercase words; empty when ``text``
+        contains no word of four or more characters.
+    """
+    words = re.findall(r'\b\w{4,}\b', text.lower())
+    # dict.fromkeys de-duplicates while keeping first-occurrence order. A set
+    # has arbitrary order, so slicing it gave different keywords between runs.
+    keywords = list(dict.fromkeys(words))
+    return keywords[:10]
+
+def scrape_author_info(author_url, headline_pages=1):
+    """Scrape an author's profile and recent articles from their contributor page.
+
+    A Firefox session is started for the call and is always quit before
+    the function returns.
+
+    Args:
+        author_url: URL of the author's profile page. Listing pages are
+            requested as ``{author_url}/Page-{n}.html``.
+        headline_pages: Number of listing pages to read, starting at page 1.
+
+    Returns:
+        A dict with the keys ``name``, ``bio``, ``contributor_since`` and
+        ``other_articles``. Each entry of ``other_articles`` is a dict with
+        ``heading``, ``excerpt`` and ``keywords``. If any scraping step
+        raises, the error is printed and every field is replaced by a
+        placeholder (``bio`` then carries the exception text).
+    """
+    options = Options()
+    # Pass the flag directly: the ``Options.headless`` setter was deprecated
+    # and later removed in Selenium 4, where assigning it has no effect and
+    # a visible browser window is opened instead.
+    options.add_argument("-headless")
+    driver = webdriver.Firefox(options=options)
+
+    author_name = "Unknown"
+    author_bio = ""
+    contributor_since = ""
+    other_articles = []
+
+    try:
+        # Load author page
+        driver.get(author_url)
+        WebDriverWait(driver, 15).until(
+            EC.presence_of_element_located((By.TAG_NAME, "h1"))
+        )
+        page_source = driver.page_source
+        bio_soup = BeautifulSoup(page_source, "html.parser")
+
+        # Extract author name
+        author_name_tag = bio_soup.find('h1')
+        author_name = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"
+
+        # Extract author bio
+        author_bio_tag = bio_soup.find('div', class_='biography')
+        author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"
+
+        # Extract contributor since date
+        contributor_since_tag = bio_soup.find('p', class_='contributor_since')
+        contributor_since = contributor_since_tag.get_text(strip=True).replace("Contributor since: ", "") if contributor_since_tag else "Unknown Date"
+
+        # Extract latest articles by author with heading, excerpt, and keywords
+        for page in range(1, headline_pages + 1):
+            driver.get(f"{author_url}/Page-{page}.html")
+            WebDriverWait(driver, 10).until(
+                EC.presence_of_element_located((By.CLASS_NAME, "articles"))
+            )
+            page_soup = BeautifulSoup(driver.page_source, "html.parser")
+            article_tags = page_soup.find_all('li', class_='clear')
+
+            for article in article_tags:
+                heading_tag = article.find('h3')
+                excerpt_tag = article.find('p', class_='articlecontent')
+
+                # Skip list items that lack a heading or an excerpt.
+                if not (heading_tag and excerpt_tag):
+                    continue
+
+                heading = heading_tag.get_text(strip=True)
+                excerpt = excerpt_tag.get_text(strip=True)
+                keywords = extract_keywords(excerpt)
+
+                other_articles.append({
+                    "heading": heading,
+                    "excerpt": excerpt,
+                    "keywords": keywords
+                })
+
+    except Exception as e:
+        # Best effort: report the failure and return placeholder data with
+        # the same shape as a successful result.
+        print(f"Error scraping author info: {e}")
+        author_name = "Error Occurred"
+        author_bio = str(e)
+        contributor_since = "N/A"
+        other_articles = [{"heading": "Error retrieving articles", "excerpt": "", "keywords": []}]
+
+    finally:
+        driver.quit()
+
+    return {
+        "name": author_name,
+        "bio": author_bio,
+        "contributor_since": contributor_since,
+        "other_articles": other_articles
+    }
+
+def save_to_json(data, output_file):
+    """Write ``data`` to ``output_file`` as indented UTF-8 JSON and print the path."""
+    with open(output_file, "w", encoding="utf-8") as handle:
+        # ensure_ascii=False keeps non-ASCII characters readable in the file.
+        json.dump(data, handle, indent=4, ensure_ascii=False)
+
+    print(f"Author info saved to {output_file}")
+
+if __name__ == "__main__":
+    # Scrape the configured author's first listing page, then persist the
+    # result as JSON.
+    author_info = scrape_author_info(author_url=AUTHOR_URL, headline_pages=1)
+    save_to_json(data=author_info, output_file=OUTPUT_FILE)
+