added author test, about to integrate into preprocessor.py
This commit is contained in:
@@ -82,37 +82,63 @@ def filter_content(content):
|
|||||||
content = re.sub(r'\s+', ' ', content).strip()
|
content = re.sub(r'\s+', ' ', content).strip()
|
||||||
return content
|
return content
|
||||||
|
|
||||||
def extract_author_info(driver, article_soup, headline_pages=1):
    """Extract detailed author information from the 'read more' link if available.

    Searches ``article_soup`` for an anchor whose text matches "More Info" or
    "Read More", loads its ``href`` with ``driver`` and scrapes the author's
    name (first ``h1``), bio (first ``p``), "Contributor since" date and the
    headlines found on the first ``headline_pages`` listing pages.

    Args:
        driver: Selenium WebDriver used to load the author pages. It is left
            on whichever page was loaded last.
        article_soup: Parsed article markup to search for the author link.
        headline_pages: Number of ``Page-N.html`` listing pages to read.

    Returns:
        dict with keys ``"name"``, ``"bio"``, ``"contributor_since"`` and
        ``"other_articles"`` (list of headline strings). Fields that could
        not be scraped keep their defaults ("Unknown Author", "", "", []).
    """
    author_info = {
        "name": "Unknown Author",
        "bio": "",
        "contributor_since": "",
        "other_articles": [],
    }

    # `string=` is the current name of BeautifulSoup's deprecated `text=` filter.
    author_tag = article_soup.find('a', string=re.compile(r'More Info|Read More', re.IGNORECASE))
    author_url = author_tag.get('href') if author_tag else None
    if not author_url:
        # No link (or a link without href): nothing to load, so don't burn retries.
        return author_info

    # Normalise so the pagination URL always has exactly one '/' separator.
    base_url = author_url.rstrip('/')
    # NOTE(review): the standalone author scraper added in the same commit waits
    # on different selectors ("articles", li.clear > h3) - confirm which markup
    # the author page really uses before relying on the ones below.
    retries = 3  # Set retry limit
    for attempt in range(retries):
        # Collected per attempt so a failed attempt cannot leave duplicates behind.
        headlines = []
        try:
            driver.get(author_url)
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CLASS_NAME, "authorBio"))
            )
            bio_soup = BeautifulSoup(driver.page_source, "html.parser")

            # Extract author's name
            author_name_tag = bio_soup.find('h1')
            author_info["name"] = (
                author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"
            )

            # Extract author's bio description
            author_bio_tag = bio_soup.find('p')
            author_info["bio"] = (
                author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"
            )

            # Extract contributor since date
            contributor_since_tag = bio_soup.find(
                string=re.compile(r"Contributor since", re.IGNORECASE)
            )
            if contributor_since_tag:
                author_info["contributor_since"] = (
                    contributor_since_tag.parent.get_text(strip=True)
                    .replace("Contributor since: ", "")
                )

            # Extract headlines of latest articles by the author, limited by `headline_pages`
            for page in range(1, headline_pages + 1):
                driver.get(f"{base_url}/Page-{page}.html")
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "categoryArticle"))
                )
                page_soup = BeautifulSoup(driver.page_source, "html.parser")
                headlines.extend(
                    article.get_text(strip=True)
                    for article in page_soup.find_all('h2', class_='categoryArticle__title')
                )

            # Only publish headlines from an attempt that ran to completion.
            author_info["other_articles"] = headlines
            break  # Break loop if successful

        except Exception as e:
            if attempt == retries - 1:
                print(f"Author bio page failed to load or extract after {retries} attempts. Error: {e}")
            else:
                print(f"Attempt {attempt + 1} failed for author bio page. Retrying...")
                time.sleep(2)  # Wait before retrying

    return author_info
|
||||||
|
|
||||||
def scrape_oil_news():
|
def scrape_oil_news():
|
||||||
print("Scraping oil news articles for sentiment analysis...")
|
print("Scraping oil news articles for sentiment analysis...")
|
||||||
@@ -173,13 +199,16 @@ def scrape_oil_news():
|
|||||||
extracted_keywords = extract_keywords(f"{headline} {content}", keyword_importance)
|
extracted_keywords = extract_keywords(f"{headline} {content}", keyword_importance)
|
||||||
|
|
||||||
if headline and link and date:
|
if headline and link and date:
|
||||||
|
author_info = extract_author_info(driver, article_soup, headline_pages=1)
|
||||||
news_data.append({
|
news_data.append({
|
||||||
'headline': headline,
|
'headline': headline,
|
||||||
'link': link,
|
'link': link,
|
||||||
'content': content,
|
'content': content,
|
||||||
'date': date,
|
'date': date,
|
||||||
'author': author,
|
'author': author_info['name'],
|
||||||
'author_bio': author_bio,
|
'author_bio': author_info['bio'],
|
||||||
|
'contributor_since': author_info['contributor_since'],
|
||||||
|
'other_articles': author_info['other_articles'],
|
||||||
'keywords': extracted_keywords,
|
'keywords': extracted_keywords,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|||||||
347
Data-Collection/WebScraper/scrapers/tests/author_info.json
Normal file
347
Data-Collection/WebScraper/scrapers/tests/author_info.json
Normal file
@@ -0,0 +1,347 @@
|
|||||||
|
{
|
||||||
|
"name": "Charles Kennedy",
|
||||||
|
"bio": "Charles is a writer for Oilprice.com",
|
||||||
|
"contributor_since": "29 Sep 2011",
|
||||||
|
"other_articles": [
|
||||||
|
{
|
||||||
|
"heading": "Record Shale Production Helps ConocoPhillips Beat Profit Estimates",
|
||||||
|
"excerpt": "ConocoPhillips (NYSE: COP) is raising its ordinary dividend and share buyback program as its third-quarter earnings beat market expectations on the back of higher total…",
|
||||||
|
"keywords": [
|
||||||
|
"share",
|
||||||
|
"market",
|
||||||
|
"higher",
|
||||||
|
"back",
|
||||||
|
"total",
|
||||||
|
"expectations",
|
||||||
|
"third",
|
||||||
|
"beat",
|
||||||
|
"raising",
|
||||||
|
"conocophillips"
|
||||||
|
],
|
||||||
|
"published_date": "31 October 2024"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"heading": "Rosneft to Resume Output at Idled Black Sea Refinery in November",
|
||||||
|
"excerpt": "Rosneft plans to resume crude processing at its Tuapse oil refinery on Russia’s Black Sea coast in November, after idling it for a month because…",
|
||||||
|
"keywords": [
|
||||||
|
"processing",
|
||||||
|
"idling",
|
||||||
|
"russia",
|
||||||
|
"plans",
|
||||||
|
"rosneft",
|
||||||
|
"refinery",
|
||||||
|
"tuapse",
|
||||||
|
"crude",
|
||||||
|
"november",
|
||||||
|
"black"
|
||||||
|
],
|
||||||
|
"published_date": "31 October 2024"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"heading": "Canadian Natural Resources Q3 Profit Slips as Oil and Gas Prices Fall",
|
||||||
|
"excerpt": "Canada’s largest oil and gas producer, Canadian Natural Resources (NYSE: CNQ), reported lower adjusted net earnings from operations for the third quarter compared to a…",
|
||||||
|
"keywords": [
|
||||||
|
"canada",
|
||||||
|
"operations",
|
||||||
|
"producer",
|
||||||
|
"resources",
|
||||||
|
"reported",
|
||||||
|
"canadian",
|
||||||
|
"largest",
|
||||||
|
"third",
|
||||||
|
"natural",
|
||||||
|
"nyse"
|
||||||
|
],
|
||||||
|
"published_date": "31 October 2024"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"heading": "Exelon Reports 80% Surge in Data Center Power Supply Deals",
|
||||||
|
"excerpt": "Exelon has seen an 80% increase in power supply deals coming from data enter operators in the latest sign that the IT industry is driving…",
|
||||||
|
"keywords": [
|
||||||
|
"industry",
|
||||||
|
"data",
|
||||||
|
"driving",
|
||||||
|
"seen",
|
||||||
|
"power",
|
||||||
|
"increase",
|
||||||
|
"exelon",
|
||||||
|
"deals",
|
||||||
|
"sign",
|
||||||
|
"that"
|
||||||
|
],
|
||||||
|
"published_date": "31 October 2024"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"heading": "Russia’s Gazprom Boosts 2024 Investments to $16.9 Billion",
|
||||||
|
"excerpt": "Gazprom is raising its investment plan for 2024 by 4% to $16.9 billion (1.642 trillion Russian rubles), thanks to rising exports and domestic supply, the…",
|
||||||
|
"keywords": [
|
||||||
|
"investment",
|
||||||
|
"russian",
|
||||||
|
"rubles",
|
||||||
|
"plan",
|
||||||
|
"exports",
|
||||||
|
"billion",
|
||||||
|
"raising",
|
||||||
|
"thanks",
|
||||||
|
"trillion",
|
||||||
|
"supply"
|
||||||
|
],
|
||||||
|
"published_date": "30 October 2024"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"heading": "Investment Giants Form $50-Billion AI and Power Partnership",
|
||||||
|
"excerpt": "Global investment firm KKR and private-equity giant Energy Capital Partners on Wednesday announced a $50 billion strategic partnership to invest in data centers and power…",
|
||||||
|
"keywords": [
|
||||||
|
"centers",
|
||||||
|
"strategic",
|
||||||
|
"investment",
|
||||||
|
"giant",
|
||||||
|
"energy",
|
||||||
|
"capital",
|
||||||
|
"private",
|
||||||
|
"wednesday",
|
||||||
|
"billion",
|
||||||
|
"data"
|
||||||
|
],
|
||||||
|
"published_date": "30 October 2024"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"heading": "Vietnamese EV Maker Gets $1 Billion in Funding Led by UAE",
|
||||||
|
"excerpt": "Vietnam’s electric vehicle manufacturer VinFast Auto is expected to receive at least $1 billion in overseas funding led by Emirates Driving Company (EDC), Abu Dhabi’s…",
|
||||||
|
"keywords": [
|
||||||
|
"overseas",
|
||||||
|
"manufacturer",
|
||||||
|
"vietnam",
|
||||||
|
"expected",
|
||||||
|
"billion",
|
||||||
|
"driving",
|
||||||
|
"emirates",
|
||||||
|
"funding",
|
||||||
|
"receive",
|
||||||
|
"least"
|
||||||
|
],
|
||||||
|
"published_date": "30 October 2024"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"heading": "Chinese Oil Major to Explore Iraqi Field",
|
||||||
|
"excerpt": "China’s CNOOC has inked a deal for exploration at an oil field in central Iraq, the company said today.\nThe deposit, Block 7, will be…",
|
||||||
|
"keywords": [
|
||||||
|
"deposit",
|
||||||
|
"cnooc",
|
||||||
|
"iraq",
|
||||||
|
"field",
|
||||||
|
"central",
|
||||||
|
"deal",
|
||||||
|
"today",
|
||||||
|
"said",
|
||||||
|
"china",
|
||||||
|
"inked"
|
||||||
|
],
|
||||||
|
"published_date": "30 October 2024"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"heading": "TotalEnergies to Produce More Gas Condensate Offshore Denmark",
|
||||||
|
"excerpt": "U.S. refining and chemicals giant Phillips 66 (NYSE: PSX) booked higher-than-expected earnings for the third quarter even if earnings plunged from a year earlier, as…",
|
||||||
|
"keywords": [
|
||||||
|
"phillips",
|
||||||
|
"refining",
|
||||||
|
"giant",
|
||||||
|
"than",
|
||||||
|
"expected",
|
||||||
|
"higher",
|
||||||
|
"year",
|
||||||
|
"plunged",
|
||||||
|
"third",
|
||||||
|
"even"
|
||||||
|
],
|
||||||
|
"published_date": "29 October 2024"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"heading": "Phillips 66 Beats Analyst Estimates Despite Earnings Dip in Q3",
|
||||||
|
"excerpt": "U.S. refining and chemicals giant Phillips 66 (NYSE: PSX) booked higher-than-expected earnings for the third quarter even if earnings plunged from a year earlier, as…",
|
||||||
|
"keywords": [
|
||||||
|
"phillips",
|
||||||
|
"refining",
|
||||||
|
"giant",
|
||||||
|
"than",
|
||||||
|
"expected",
|
||||||
|
"higher",
|
||||||
|
"year",
|
||||||
|
"plunged",
|
||||||
|
"third",
|
||||||
|
"even"
|
||||||
|
],
|
||||||
|
"published_date": "29 October 2024"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"heading": "UK Offshore Oil Platform Halted Due to Gas Compressor Issue",
|
||||||
|
"excerpt": "Production via the Triton Floating Production Storage & Offloading (FPSO) vessel in the UK North Sea has been halted due to a problem with the…",
|
||||||
|
"keywords": [
|
||||||
|
"fpso",
|
||||||
|
"been",
|
||||||
|
"with",
|
||||||
|
"problem",
|
||||||
|
"halted",
|
||||||
|
"storage",
|
||||||
|
"triton",
|
||||||
|
"vessel",
|
||||||
|
"offloading",
|
||||||
|
"north"
|
||||||
|
],
|
||||||
|
"published_date": "29 October 2024"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"heading": "UAE’s Renewable Energy Giant Pushes Back Green Hydrogen Targets",
|
||||||
|
"excerpt": "Masdar, the clean energy giant of the United Arab Emirates (UAE), has pushed back its target to reach 1 million tons per year of green…",
|
||||||
|
"keywords": [
|
||||||
|
"united",
|
||||||
|
"energy",
|
||||||
|
"giant",
|
||||||
|
"emirates",
|
||||||
|
"back",
|
||||||
|
"year",
|
||||||
|
"million",
|
||||||
|
"arab",
|
||||||
|
"pushed",
|
||||||
|
"target"
|
||||||
|
],
|
||||||
|
"published_date": "28 October 2024"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"heading": "Profit at India’s Top Refiner Slumps by 99% Due to Weak Margins",
|
||||||
|
"excerpt": "IndianOil, the biggest refiner in India, reported on Monday a net profit tumbling by 98.6% in the quarter to September from a year ago amid…",
|
||||||
|
"keywords": [
|
||||||
|
"refiner",
|
||||||
|
"monday",
|
||||||
|
"september",
|
||||||
|
"biggest",
|
||||||
|
"reported",
|
||||||
|
"indianoil",
|
||||||
|
"india",
|
||||||
|
"year",
|
||||||
|
"tumbling",
|
||||||
|
"profit"
|
||||||
|
],
|
||||||
|
"published_date": "28 October 2024"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"heading": "Average U.S. Gasoline Price Set to Drop Below $3 for the First Time Since 2021",
|
||||||
|
"excerpt": "The U.S. national average price of gasoline is set to soon fall below $3 per gallon for the first time since 2021, amid lower seasonal…",
|
||||||
|
"keywords": [
|
||||||
|
"gasoline",
|
||||||
|
"national",
|
||||||
|
"below",
|
||||||
|
"gallon",
|
||||||
|
"soon",
|
||||||
|
"first",
|
||||||
|
"lower",
|
||||||
|
"average",
|
||||||
|
"seasonal",
|
||||||
|
"price"
|
||||||
|
],
|
||||||
|
"published_date": "28 October 2024"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"heading": "FERC Grants Exxon and Qatar Three-Year Extension to Build Golden Pass LNG",
|
||||||
|
"excerpt": "The U.S. Federal Energy Regulatory Commission has granted a three-year extension to ExxonMobil and QatarEnergy to build their $10-billion Golden Pass LNG export plant in…",
|
||||||
|
"keywords": [
|
||||||
|
"federal",
|
||||||
|
"export",
|
||||||
|
"three",
|
||||||
|
"energy",
|
||||||
|
"golden",
|
||||||
|
"billion",
|
||||||
|
"year",
|
||||||
|
"their",
|
||||||
|
"qatarenergy",
|
||||||
|
"regulatory"
|
||||||
|
],
|
||||||
|
"published_date": "25 October 2024"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"heading": "Cepsa: Windfall Tax Would Delay Its $3.3-Billion Hydrogen Plan",
|
||||||
|
"excerpt": "Cepsa, Spain’s second-largest oil company, will delay its $3.25 billion (3 billion euros) investment into domestic green hydrogen projects if Spain makes the windfall tax…",
|
||||||
|
"keywords": [
|
||||||
|
"investment",
|
||||||
|
"second",
|
||||||
|
"projects",
|
||||||
|
"billion",
|
||||||
|
"euros",
|
||||||
|
"largest",
|
||||||
|
"into",
|
||||||
|
"delay",
|
||||||
|
"will",
|
||||||
|
"cepsa"
|
||||||
|
],
|
||||||
|
"published_date": "25 October 2024"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"heading": "South Africa Seeks Loan Guarantees for Energy Transition Funding",
|
||||||
|
"excerpt": "South Africa is currently negotiating loan guarantees with its international partners in its $9.3-billion Just Energy Transition Partnership (JETP) program for energy investment.\nThe International…",
|
||||||
|
"keywords": [
|
||||||
|
"jetp",
|
||||||
|
"negotiating",
|
||||||
|
"energy",
|
||||||
|
"transition",
|
||||||
|
"currently",
|
||||||
|
"investment",
|
||||||
|
"billion",
|
||||||
|
"south",
|
||||||
|
"africa",
|
||||||
|
"guarantees"
|
||||||
|
],
|
||||||
|
"published_date": "25 October 2024"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"heading": "Saudi Oil Export Revenues Hit Three-Year Low as Prices Decline",
|
||||||
|
"excerpt": "Lower crude oil prices dragged Saudi Arabia’s oil export revenues to the lowest level in more than three years in August, amid underwhelming oil demand…",
|
||||||
|
"keywords": [
|
||||||
|
"years",
|
||||||
|
"three",
|
||||||
|
"august",
|
||||||
|
"than",
|
||||||
|
"more",
|
||||||
|
"dragged",
|
||||||
|
"revenues",
|
||||||
|
"saudi",
|
||||||
|
"crude",
|
||||||
|
"prices"
|
||||||
|
],
|
||||||
|
"published_date": "24 October 2024"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"heading": "Tesla Stock Soars After Q3 Earnings Beat",
|
||||||
|
"excerpt": "Tesla (NASDAQ: TSLA) saw its shares jump by 20% after hours on Wednesday and another 14% in pre-market trade on Thursday after reporting earnings for…",
|
||||||
|
"keywords": [
|
||||||
|
"thursday",
|
||||||
|
"after",
|
||||||
|
"trade",
|
||||||
|
"market",
|
||||||
|
"tesla",
|
||||||
|
"wednesday",
|
||||||
|
"another",
|
||||||
|
"nasdaq",
|
||||||
|
"hours",
|
||||||
|
"reporting"
|
||||||
|
],
|
||||||
|
"published_date": "24 October 2024"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"heading": "Oil Refining Giant Valero Tops Estimates Despite Q3 Profit Plunge",
|
||||||
|
"excerpt": "One of the biggest U.S. refiners, Valero Energy (NYSE: VLO), beat Wall Street estimates even as it reported a widely expected plunge in its third-quarter…",
|
||||||
|
"keywords": [
|
||||||
|
"street",
|
||||||
|
"energy",
|
||||||
|
"biggest",
|
||||||
|
"wall",
|
||||||
|
"reported",
|
||||||
|
"expected",
|
||||||
|
"plunge",
|
||||||
|
"widely",
|
||||||
|
"third",
|
||||||
|
"valero"
|
||||||
|
],
|
||||||
|
"published_date": "24 October 2024"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
109
Data-Collection/WebScraper/scrapers/tests/author_scraper_test.py
Normal file
109
Data-Collection/WebScraper/scrapers/tests/author_scraper_test.py
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
import json
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.firefox.options import Options
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
AUTHOR_URL = "https://oilprice.com/contributors/Charles-Kennedy" # Replace with actual author URL
|
||||||
|
OUTPUT_FILE = "author_info.json"
|
||||||
|
|
||||||
|
def extract_keywords(text):
    """Return up to 10 distinct lowercase words of at least 4 characters.

    Words are returned in order of first appearance in ``text``, so the
    result is the same on every run (a plain ``set`` would yield an
    arbitrary, hash-seed-dependent selection and order).
    """
    words = re.findall(r'\b\w{4,}\b', text.lower())
    # dict.fromkeys de-duplicates while preserving insertion order.
    unique_words = list(dict.fromkeys(words))
    return unique_words[:10]  # Limit to the first 10 unique keywords for simplicity
|
||||||
|
|
||||||
|
def scrape_author_info(author_url, headline_pages=1):
    """Scrape author's name, bio, contributor since date, and latest articles.

    Starts a headless Firefox session, loads ``author_url`` and then the
    first ``headline_pages`` listing pages (``<author_url>/Page-N.html``).
    Each article entry carries its heading, excerpt, keywords extracted
    from the excerpt, and published date.

    Args:
        author_url: URL of the author's contributor page.
        headline_pages: Number of listing pages to read articles from.

    Returns:
        dict with keys ``"name"``, ``"bio"``, ``"contributor_since"`` and
        ``"other_articles"``. If any step raises, the fields are replaced by
        error placeholders (``"bio"`` holds the exception text).
    """
    options = Options()
    # The `headless` attribute setter was removed from Selenium 4; passing the
    # Firefox command-line flag works on both old and new releases.
    options.add_argument("-headless")
    driver = webdriver.Firefox(options=options)

    author_name = "Unknown"
    author_bio = ""
    contributor_since = ""
    other_articles = []

    try:
        # Load author page
        driver.get(author_url)
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )
        page_source = driver.page_source
        bio_soup = BeautifulSoup(page_source, "html.parser")

        # Extract author name
        author_name_tag = bio_soup.find('h1')
        author_name = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"

        # Extract author bio
        author_bio_tag = bio_soup.find('div', class_='biography')
        author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"

        # Extract contributor since date
        contributor_since_tag = bio_soup.find('p', class_='contributor_since')
        contributor_since = contributor_since_tag.get_text(strip=True).replace("Contributor since: ", "") if contributor_since_tag else "Unknown Date"

        # Strip a trailing slash so the listing URL never contains '//Page-N.html'.
        base_url = author_url.rstrip("/")

        # Extract latest articles by author with heading, excerpt, keywords, and timestamp
        for page in range(1, headline_pages + 1):
            driver.get(f"{base_url}/Page-{page}.html")
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "articles"))
            )
            page_soup = BeautifulSoup(driver.page_source, "html.parser")
            article_tags = page_soup.find_all('li', class_='clear')

            for article in article_tags:
                heading_tag = article.find('h3')
                excerpt_tag = article.find('p', class_='articlecontent')
                timestamp_tag = article.find('div', class_='meta')

                # Entries missing any of the three parts are skipped.
                if heading_tag and excerpt_tag and timestamp_tag:
                    heading = heading_tag.get_text(strip=True)
                    excerpt = excerpt_tag.get_text(strip=True)
                    # Keep only the part before the first '|' and drop the "Published " label.
                    timestamp = timestamp_tag.get_text(strip=True).split("|")[0].replace("Published ", "").strip()
                    keywords = extract_keywords(excerpt)

                    other_articles.append({
                        "heading": heading,
                        "excerpt": excerpt,
                        "keywords": keywords,
                        "published_date": timestamp
                    })

    except Exception as e:
        print(f"Error scraping author info: {e}")
        author_name = "Error Occurred"
        author_bio = str(e)
        contributor_since = "N/A"
        other_articles = [{"heading": "Error retrieving articles", "excerpt": "", "keywords": [], "published_date": ""}]

    finally:
        # Always release the browser, even when scraping failed.
        driver.quit()

    return {
        "name": author_name,
        "bio": author_bio,
        "contributor_since": contributor_since,
        "other_articles": other_articles
    }
|
||||||
|
|
||||||
|
def save_to_json(data, output_file):
    """Write ``data`` to ``output_file`` as indented UTF-8 JSON and report the path."""
    with open(output_file, "w", encoding="utf-8") as handle:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(data, handle, indent=4, ensure_ascii=False)

    print(f"Author info saved to {output_file}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: scrape the configured author (first listing page
    # only), then persist the result as JSON.
    author_info = scrape_author_info(author_url=AUTHOR_URL, headline_pages=1)
    save_to_json(data=author_info, output_file=OUTPUT_FILE)
|
||||||
|
|
||||||
@@ -0,0 +1,106 @@
|
|||||||
|
import json
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.firefox.options import Options
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
AUTHOR_URL = "https://oilprice.com/contributors/Charles-Kennedy" # Replace with actual author URL
|
||||||
|
OUTPUT_FILE = "author_info.json"
|
||||||
|
|
||||||
|
def extract_keywords(text):
    """Return up to 10 distinct lowercase words of at least 4 characters.

    Words are returned in order of first appearance in ``text``, so the
    result is the same on every run (a plain ``set`` would yield an
    arbitrary, hash-seed-dependent selection and order).
    """
    words = re.findall(r'\b\w{4,}\b', text.lower())
    # dict.fromkeys de-duplicates while preserving insertion order.
    unique_words = list(dict.fromkeys(words))
    return unique_words[:10]  # Limit to the first 10 unique keywords for simplicity
|
||||||
|
|
||||||
|
def scrape_author_info(author_url, headline_pages=1):
    """Scrape author's name, bio, contributor since date, and latest articles.

    Starts a headless Firefox session, loads ``author_url`` and then the
    first ``headline_pages`` listing pages (``<author_url>/Page-N.html``).
    Each article entry carries its heading, excerpt, and keywords extracted
    from the excerpt.

    Args:
        author_url: URL of the author's contributor page.
        headline_pages: Number of listing pages to read articles from.

    Returns:
        dict with keys ``"name"``, ``"bio"``, ``"contributor_since"`` and
        ``"other_articles"``. If any step raises, the fields are replaced by
        error placeholders (``"bio"`` holds the exception text).
    """
    options = Options()
    # The `headless` attribute setter was removed from Selenium 4; passing the
    # Firefox command-line flag works on both old and new releases.
    options.add_argument("-headless")
    driver = webdriver.Firefox(options=options)

    author_name = "Unknown"
    author_bio = ""
    contributor_since = ""
    other_articles = []

    try:
        # Load author page
        driver.get(author_url)
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )
        page_source = driver.page_source
        bio_soup = BeautifulSoup(page_source, "html.parser")

        # Extract author name
        author_name_tag = bio_soup.find('h1')
        author_name = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"

        # Extract author bio
        author_bio_tag = bio_soup.find('div', class_='biography')
        author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"

        # Extract contributor since date
        contributor_since_tag = bio_soup.find('p', class_='contributor_since')
        contributor_since = contributor_since_tag.get_text(strip=True).replace("Contributor since: ", "") if contributor_since_tag else "Unknown Date"

        # Strip a trailing slash so the listing URL never contains '//Page-N.html'.
        base_url = author_url.rstrip("/")

        # Extract latest articles by author with heading, excerpt, and keywords
        for page in range(1, headline_pages + 1):
            driver.get(f"{base_url}/Page-{page}.html")
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "articles"))
            )
            page_soup = BeautifulSoup(driver.page_source, "html.parser")
            article_tags = page_soup.find_all('li', class_='clear')

            for article in article_tags:
                heading_tag = article.find('h3')
                excerpt_tag = article.find('p', class_='articlecontent')

                # Entries missing a heading or an excerpt are skipped.
                if heading_tag and excerpt_tag:
                    heading = heading_tag.get_text(strip=True)
                    excerpt = excerpt_tag.get_text(strip=True)
                    keywords = extract_keywords(excerpt)

                    other_articles.append({
                        "heading": heading,
                        "excerpt": excerpt,
                        "keywords": keywords
                    })

    except Exception as e:
        print(f"Error scraping author info: {e}")
        author_name = "Error Occurred"
        author_bio = str(e)
        contributor_since = "N/A"
        other_articles = [{"heading": "Error retrieving articles", "excerpt": "", "keywords": []}]

    finally:
        # Always release the browser, even when scraping failed.
        driver.quit()

    return {
        "name": author_name,
        "bio": author_bio,
        "contributor_since": contributor_since,
        "other_articles": other_articles
    }
|
||||||
|
|
||||||
|
def save_to_json(data, output_file):
    """Write ``data`` to ``output_file`` as indented UTF-8 JSON and report the path."""
    with open(output_file, "w", encoding="utf-8") as handle:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(data, handle, indent=4, ensure_ascii=False)

    print(f"Author info saved to {output_file}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: scrape the configured author (first listing page
    # only), then persist the result as JSON.
    author_info = scrape_author_info(author_url=AUTHOR_URL, headline_pages=1)
    save_to_json(data=author_info, output_file=OUTPUT_FILE)
|
||||||
|
|
||||||
Reference in New Issue
Block a user