def extract_author_info(driver, article_soup, headline_pages=1, retries=3, retry_delay=2):
    """Extract author details by following the article's 'More Info' / 'Read More' link.

    The function navigates ``driver`` to the author's profile page and then to
    its paginated article listings, so the browser is no longer on the
    article page when this returns.

    Args:
        driver: Selenium WebDriver used to load the author pages.
        article_soup: Parsed article page; only its ``find()`` method is used,
            to locate the author link.
        headline_pages: Number of ``Page-N.html`` listing pages to read
            headlines from.
        retries: Maximum number of attempts at loading and parsing the author
            pages.
        retry_delay: Seconds to wait between failed attempts.

    Returns:
        dict with the keys ``name``, ``bio``, ``contributor_since`` and
        ``other_articles`` (a list of headline strings). Values that could not
        be extracted keep their defaults: ``"Unknown Author"``, ``""``, ``""``
        and ``[]``. Headlines are only reported when every requested listing
        page was read successfully.
    """
    author = "Unknown Author"
    author_bio = ""
    contributor_since = ""
    other_articles = []

    author_tag = article_soup.find('a', string=re.compile(r'More Info|Read More', re.IGNORECASE))
    # A link without an href can never be loaded, so don't burn retries on it.
    author_url = author_tag.get('href') if author_tag is not None else None

    if author_url:
        # Normalise the trailing slash so listing URLs are always ".../Page-N.html".
        listing_base = author_url.rstrip('/')

        for attempt in range(1, retries + 1):
            # Collected per attempt: a retry must not append to the partial
            # results of a previous failed attempt.
            headlines = []
            try:
                driver.get(author_url)
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "authorBio"))
                )
                bio_soup = BeautifulSoup(driver.page_source, "html.parser")

                # Author's name: first <h1> on the profile page.
                author_name_tag = bio_soup.find('h1')
                author = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"

                # Author's bio: first <p> on the profile page.
                author_bio_tag = bio_soup.find('p')
                author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"

                # "Contributor since" date: text of the element holding that label.
                contributor_since_tag = bio_soup.find(string=re.compile(r"Contributor since", re.IGNORECASE))
                if contributor_since_tag:
                    contributor_since = (
                        contributor_since_tag.parent.get_text(strip=True).replace("Contributor since: ", "")
                    )

                # Headlines of the author's latest articles, limited by `headline_pages`.
                for page in range(1, headline_pages + 1):
                    driver.get(f"{listing_base}/Page-{page}.html")
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
                    )
                    page_soup = BeautifulSoup(driver.page_source, "html.parser")
                    headlines.extend(
                        title.get_text(strip=True)
                        for title in page_soup.find_all('h2', class_='categoryArticle__title')
                    )

                other_articles = headlines
                break  # Success: no further attempts needed.

            except Exception as e:
                # Best-effort boundary: any load/parse failure triggers a retry
                # instead of aborting the surrounding scrape.
                if attempt < retries:
                    print(f"Attempt {attempt} failed for author bio page. Retrying...")
                    time.sleep(retry_delay)
                else:
                    print(f"Author bio page failed to load or extract after {retries} attempts. Error: {e}")

    return {
        "name": author,
        "bio": author_bio,
        "contributor_since": contributor_since,
        "other_articles": other_articles,
    }
"plans", + "rosneft", + "refinery", + "tuapse", + "crude", + "november", + "black" + ], + "published_date": "31 October 2024" + }, + { + "heading": "Canadian Natural Resources Q3 Profit Slips as Oil and Gas Prices Fall", + "excerpt": "Canada’s largest oil and gas producer, Canadian Natural Resources (NYSE: CNQ), reported lower adjusted net earnings from operations for the third quarter compared to a…", + "keywords": [ + "canada", + "operations", + "producer", + "resources", + "reported", + "canadian", + "largest", + "third", + "natural", + "nyse" + ], + "published_date": "31 October 2024" + }, + { + "heading": "Exelon Reports 80% Surge in Data Center Power Supply Deals", + "excerpt": "Exelon has seen an 80% increase in power supply deals coming from data enter operators in the latest sign that the IT industry is driving…", + "keywords": [ + "industry", + "data", + "driving", + "seen", + "power", + "increase", + "exelon", + "deals", + "sign", + "that" + ], + "published_date": "31 October 2024" + }, + { + "heading": "Russia’s Gazprom Boosts 2024 Investments to $16.9 Billion", + "excerpt": "Gazprom is raising its investment plan for 2024 by 4% to $16.9 billion (1.642 trillion Russian rubles), thanks to rising exports and domestic supply, the…", + "keywords": [ + "investment", + "russian", + "rubles", + "plan", + "exports", + "billion", + "raising", + "thanks", + "trillion", + "supply" + ], + "published_date": "30 October 2024" + }, + { + "heading": "Investment Giants Form $50-Billion AI and Power Partnership", + "excerpt": "Global investment firm KKR and private-equity giant Energy Capital Partners on Wednesday announced a $50 billion strategic partnership to invest in data centers and power…", + "keywords": [ + "centers", + "strategic", + "investment", + "giant", + "energy", + "capital", + "private", + "wednesday", + "billion", + "data" + ], + "published_date": "30 October 2024" + }, + { + "heading": "Vietnamese EV Maker Gets $1 Billion in Funding Led by UAE", + 
"excerpt": "Vietnam’s electric vehicle manufacturer VinFast Auto is expected to receive at least $1 billion in overseas funding led by Emirates Driving Company (EDC), Abu Dhabi’s…", + "keywords": [ + "overseas", + "manufacturer", + "vietnam", + "expected", + "billion", + "driving", + "emirates", + "funding", + "receive", + "least" + ], + "published_date": "30 October 2024" + }, + { + "heading": "Chinese Oil Major to Explore Iraqi Field", + "excerpt": "China’s CNOOC has inked a deal for exploration at an oil field in central Iraq, the company said today.\nThe deposit, Block 7, will be…", + "keywords": [ + "deposit", + "cnooc", + "iraq", + "field", + "central", + "deal", + "today", + "said", + "china", + "inked" + ], + "published_date": "30 October 2024" + }, + { + "heading": "TotalEnergies to Produce More Gas Condensate Offshore Denmark", + "excerpt": "U.S. refining and chemicals giant Phillips 66 (NYSE: PSX) booked higher-than-expected earnings for the third quarter even if earnings plunged from a year earlier, as…", + "keywords": [ + "phillips", + "refining", + "giant", + "than", + "expected", + "higher", + "year", + "plunged", + "third", + "even" + ], + "published_date": "29 October 2024" + }, + { + "heading": "Phillips 66 Beats Analyst Estimates Despite Earnings Dip in Q3", + "excerpt": "U.S. 
refining and chemicals giant Phillips 66 (NYSE: PSX) booked higher-than-expected earnings for the third quarter even if earnings plunged from a year earlier, as…", + "keywords": [ + "phillips", + "refining", + "giant", + "than", + "expected", + "higher", + "year", + "plunged", + "third", + "even" + ], + "published_date": "29 October 2024" + }, + { + "heading": "UK Offshore Oil Platform Halted Due to Gas Compressor Issue", + "excerpt": "Production via the Triton Floating Production Storage & Offloading (FPSO) vessel in the UK North Sea has been halted due to a problem with the…", + "keywords": [ + "fpso", + "been", + "with", + "problem", + "halted", + "storage", + "triton", + "vessel", + "offloading", + "north" + ], + "published_date": "29 October 2024" + }, + { + "heading": "UAE’s Renewable Energy Giant Pushes Back Green Hydrogen Targets", + "excerpt": "Masdar, the clean energy giant of the United Arab Emirates (UAE), has pushed back its target to reach 1 million tons per year of green…", + "keywords": [ + "united", + "energy", + "giant", + "emirates", + "back", + "year", + "million", + "arab", + "pushed", + "target" + ], + "published_date": "28 October 2024" + }, + { + "heading": "Profit at India’s Top Refiner Slumps by 99% Due to Weak Margins", + "excerpt": "IndianOil, the biggest refiner in India, reported on Monday a net profit tumbling by 98.6% in the quarter to September from a year ago amid…", + "keywords": [ + "refiner", + "monday", + "september", + "biggest", + "reported", + "indianoil", + "india", + "year", + "tumbling", + "profit" + ], + "published_date": "28 October 2024" + }, + { + "heading": "Average U.S. Gasoline Price Set to Drop Below $3 for the First Time Since 2021", + "excerpt": "The U.S. 
national average price of gasoline is set to soon fall below $3 per gallon for the first time since 2021, amid lower seasonal…", + "keywords": [ + "gasoline", + "national", + "below", + "gallon", + "soon", + "first", + "lower", + "average", + "seasonal", + "price" + ], + "published_date": "28 October 2024" + }, + { + "heading": "FERC Grants Exxon and Qatar Three-Year Extension to Build Golden Pass LNG", + "excerpt": "The U.S. Federal Energy Regulatory Commission has granted a three-year extension to ExxonMobil and QatarEnergy to build their $10-billion Golden Pass LNG export plant in…", + "keywords": [ + "federal", + "export", + "three", + "energy", + "golden", + "billion", + "year", + "their", + "qatarenergy", + "regulatory" + ], + "published_date": "25 October 2024" + }, + { + "heading": "Cepsa: Windfall Tax Would Delay Its $3.3-Billion Hydrogen Plan", + "excerpt": "Cepsa, Spain’s second-largest oil company, will delay its $3.25 billion (3 billion euros) investment into domestic green hydrogen projects if Spain makes the windfall tax…", + "keywords": [ + "investment", + "second", + "projects", + "billion", + "euros", + "largest", + "into", + "delay", + "will", + "cepsa" + ], + "published_date": "25 October 2024" + }, + { + "heading": "South Africa Seeks Loan Guarantees for Energy Transition Funding", + "excerpt": "South Africa is currently negotiating loan guarantees with its international partners in its $9.3-billion Just Energy Transition Partnership (JETP) program for energy investment.\nThe International…", + "keywords": [ + "jetp", + "negotiating", + "energy", + "transition", + "currently", + "investment", + "billion", + "south", + "africa", + "guarantees" + ], + "published_date": "25 October 2024" + }, + { + "heading": "Saudi Oil Export Revenues Hit Three-Year Low as Prices Decline", + "excerpt": "Lower crude oil prices dragged Saudi Arabia’s oil export revenues to the lowest level in more than three years in August, amid underwhelming oil demand…", + 
"keywords": [ + "years", + "three", + "august", + "than", + "more", + "dragged", + "revenues", + "saudi", + "crude", + "prices" + ], + "published_date": "24 October 2024" + }, + { + "heading": "Tesla Stock Soars After Q3 Earnings Beat", + "excerpt": "Tesla (NASDAQ: TSLA) saw its shares jump by 20% after hours on Wednesday and another 14% in pre-market trade on Thursday after reporting earnings for…", + "keywords": [ + "thursday", + "after", + "trade", + "market", + "tesla", + "wednesday", + "another", + "nasdaq", + "hours", + "reporting" + ], + "published_date": "24 October 2024" + }, + { + "heading": "Oil Refining Giant Valero Tops Estimates Despite Q3 Profit Plunge", + "excerpt": "One of the biggest U.S. refiners, Valero Energy (NYSE: VLO), beat Wall Street estimates even as it reported a widely expected plunge in its third-quarter…", + "keywords": [ + "street", + "energy", + "biggest", + "wall", + "reported", + "expected", + "plunge", + "widely", + "third", + "valero" + ], + "published_date": "24 October 2024" + } + ] +} \ No newline at end of file diff --git a/Data-Collection/WebScraper/scrapers/tests/author_scraper_test.py b/Data-Collection/WebScraper/scrapers/tests/author_scraper_test.py new file mode 100644 index 0000000..8004b42 --- /dev/null +++ b/Data-Collection/WebScraper/scrapers/tests/author_scraper_test.py @@ -0,0 +1,109 @@ +import json +import re +import time +from selenium import webdriver +from selenium.webdriver.firefox.options import Options +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from bs4 import BeautifulSoup + +AUTHOR_URL = "https://oilprice.com/contributors/Charles-Kennedy" # Replace with actual author URL +OUTPUT_FILE = "author_info.json" + +def extract_keywords(text): + """Basic keyword extraction by finding unique words longer than 3 characters.""" + words = re.findall(r'\b\w{4,}\b', text.lower()) + keywords = 
def extract_keywords(text, max_keywords=10):
    """Return up to ``max_keywords`` distinct words of four or more characters.

    The text is lower-cased and keywords are returned in order of first
    appearance, so the same input always yields the same result.

    Args:
        text: Text to pull keywords from.
        max_keywords: Maximum number of keywords to return.

    Returns:
        List of unique lower-case words, at most ``max_keywords`` long.
    """
    words = re.findall(r'\b\w{4,}\b', text.lower())
    # dict.fromkeys de-duplicates while keeping first-seen order; slicing a
    # plain set would pick an arbitrary subset that can differ between runs.
    return list(dict.fromkeys(words))[:max_keywords]


def scrape_author_info(author_url, headline_pages=1):
    """Scrape an author's profile and their latest articles.

    Starts a headless Firefox session, loads ``author_url`` for the name, bio
    and "contributor since" date, then reads ``headline_pages`` listing pages
    (``<author_url>/Page-N.html``) for article headings, excerpts, keywords and
    publication dates. The browser is always closed before returning.

    Args:
        author_url: URL of the author's profile page.
        headline_pages: Number of listing pages to read.

    Returns:
        dict with the keys ``name``, ``bio``, ``contributor_since`` and
        ``other_articles``; each article is a dict with ``heading``,
        ``excerpt``, ``keywords`` and ``published_date``. If anything fails,
        placeholder values are returned instead: name ``"Error Occurred"``,
        the error text as ``bio``, ``"N/A"`` as date and a single placeholder
        article.
    """
    options = Options()
    # The ``headless`` property setter was deprecated and later removed in
    # Selenium 4.x; passing the browser flag works across versions.
    options.add_argument("-headless")
    driver = webdriver.Firefox(options=options)

    author_name = "Unknown"
    author_bio = ""
    contributor_since = ""
    other_articles = []

    try:
        # Load author page
        driver.get(author_url)
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )
        bio_soup = BeautifulSoup(driver.page_source, "html.parser")

        # Extract author name
        author_name_tag = bio_soup.find('h1')
        author_name = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"

        # Extract author bio
        author_bio_tag = bio_soup.find('div', class_='biography')
        author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"

        # Extract contributor since date
        contributor_since_tag = bio_soup.find('p', class_='contributor_since')
        if contributor_since_tag:
            contributor_since = contributor_since_tag.get_text(strip=True).replace("Contributor since: ", "")
        else:
            contributor_since = "Unknown Date"

        # Avoid "//Page-N.html" when the profile URL ends with a slash.
        listing_base = author_url.rstrip("/")

        # Extract latest articles by author with heading, excerpt, keywords, and timestamp
        for page in range(1, headline_pages + 1):
            driver.get(f"{listing_base}/Page-{page}.html")
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "articles"))
            )
            page_soup = BeautifulSoup(driver.page_source, "html.parser")

            for article in page_soup.find_all('li', class_='clear'):
                heading_tag = article.find('h3')
                excerpt_tag = article.find('p', class_='articlecontent')
                timestamp_tag = article.find('div', class_='meta')

                # Entries missing any of the three parts are skipped.
                if not (heading_tag and excerpt_tag and timestamp_tag):
                    continue

                excerpt = excerpt_tag.get_text(strip=True)
                # Keep only the part before the first "|" and drop the "Published " prefix.
                timestamp = (
                    timestamp_tag.get_text(strip=True).split("|")[0].replace("Published ", "").strip()
                )
                other_articles.append({
                    "heading": heading_tag.get_text(strip=True),
                    "excerpt": excerpt,
                    "keywords": extract_keywords(excerpt),
                    "published_date": timestamp,
                })

    except Exception as e:
        # Best-effort script: report the failure through placeholder values
        # so the output file is still written.
        print(f"Error scraping author info: {e}")
        author_name = "Error Occurred"
        author_bio = str(e)
        contributor_since = "N/A"
        other_articles = [{"heading": "Error retrieving articles", "excerpt": "", "keywords": [], "published_date": ""}]

    finally:
        driver.quit()

    return {
        "name": author_name,
        "bio": author_bio,
        "contributor_since": contributor_since,
        "other_articles": other_articles,
    }


def save_to_json(data, output_file):
    """Write ``data`` to ``output_file`` as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than escaped.

    Args:
        data: JSON-serialisable object to save.
        output_file: Path of the file to create or overwrite.
    """
    with open(output_file, mode="w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

    print(f"Author info saved to {output_file}")


if __name__ == "__main__":
    # Scrape author info
    author_info = scrape_author_info(AUTHOR_URL, headline_pages=1)

    # Save to JSON
    save_to_json(author_info, OUTPUT_FILE)
def extract_keywords(text, max_keywords=10):
    """Return up to ``max_keywords`` distinct words of four or more characters.

    The text is lower-cased and keywords are returned in order of first
    appearance, so the same input always yields the same result.

    Args:
        text: Text to pull keywords from.
        max_keywords: Maximum number of keywords to return.

    Returns:
        List of unique lower-case words, at most ``max_keywords`` long.
    """
    words = re.findall(r'\b\w{4,}\b', text.lower())
    # Order-preserving de-duplication; slicing a plain set would return an
    # arbitrary subset that can differ between runs.
    return list(dict.fromkeys(words))[:max_keywords]


def scrape_author_info(author_url, headline_pages=1):
    """Scrape an author's profile and their latest article headlines.

    Starts a headless Firefox session, loads ``author_url`` for the name, bio
    and "contributor since" date, then reads ``headline_pages`` listing pages
    (``<author_url>/Page-N.html``) for article headings, excerpts and
    keywords. The browser is always closed before returning.

    Args:
        author_url: URL of the author's profile page.
        headline_pages: Number of listing pages to read.

    Returns:
        dict with the keys ``name``, ``bio``, ``contributor_since`` and
        ``other_articles``; each article is a dict with ``heading``,
        ``excerpt`` and ``keywords``. If anything fails, placeholder values
        are returned instead: name ``"Error Occurred"``, the error text as
        ``bio``, ``"N/A"`` as date and a single placeholder article.
    """
    options = Options()
    # The ``headless`` property setter was deprecated and later removed in
    # Selenium 4.x; passing the browser flag works across versions.
    options.add_argument("-headless")
    driver = webdriver.Firefox(options=options)

    author_name = "Unknown"
    author_bio = ""
    contributor_since = ""
    other_articles = []

    try:
        # Load author page
        driver.get(author_url)
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )
        bio_soup = BeautifulSoup(driver.page_source, "html.parser")

        # Extract author name
        author_name_tag = bio_soup.find('h1')
        author_name = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"

        # Extract author bio
        author_bio_tag = bio_soup.find('div', class_='biography')
        author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"

        # Extract contributor since date
        contributor_since_tag = bio_soup.find('p', class_='contributor_since')
        if contributor_since_tag:
            contributor_since = contributor_since_tag.get_text(strip=True).replace("Contributor since: ", "")
        else:
            contributor_since = "Unknown Date"

        # Avoid "//Page-N.html" when the profile URL ends with a slash.
        listing_base = author_url.rstrip("/")

        # Extract latest articles by author with heading, excerpt, and keywords
        for page in range(1, headline_pages + 1):
            driver.get(f"{listing_base}/Page-{page}.html")
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "articles"))
            )
            page_soup = BeautifulSoup(driver.page_source, "html.parser")

            for article in page_soup.find_all('li', class_='clear'):
                heading_tag = article.find('h3')
                excerpt_tag = article.find('p', class_='articlecontent')

                # Entries without both a heading and an excerpt are skipped.
                if not (heading_tag and excerpt_tag):
                    continue

                excerpt = excerpt_tag.get_text(strip=True)
                other_articles.append({
                    "heading": heading_tag.get_text(strip=True),
                    "excerpt": excerpt,
                    "keywords": extract_keywords(excerpt),
                })

    except Exception as e:
        # Best-effort script: report the failure through placeholder values
        # so the output file is still written.
        print(f"Error scraping author info: {e}")
        author_name = "Error Occurred"
        author_bio = str(e)
        contributor_since = "N/A"
        other_articles = [{"heading": "Error retrieving articles", "excerpt": "", "keywords": []}]

    finally:
        driver.quit()

    return {
        "name": author_name,
        "bio": author_bio,
        "contributor_since": contributor_since,
        "other_articles": other_articles,
    }


def save_to_json(data, output_file):
    """Write ``data`` to ``output_file`` as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than escaped.

    Args:
        data: JSON-serialisable object to save.
        output_file: Path of the file to create or overwrite.
    """
    with open(output_file, mode="w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

    print(f"Author info saved to {output_file}")


if __name__ == "__main__":
    # Scrape author info
    author_info = scrape_author_info(AUTHOR_URL, headline_pages=1)

    # Save to JSON
    save_to_json(author_info, OUTPUT_FILE)