MidasEngine/Data-Collection/WebScraper/scrapers/tests/author_scraper_test.py.bak

import json
import re

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

AUTHOR_URL = "https://oilprice.com/contributors/Charles-Kennedy"  # Replace with the actual author URL
OUTPUT_FILE = "author_info.json"


def extract_keywords(text):
    """Basic keyword extraction: collect unique words longer than 3 characters."""
    words = re.findall(r'\b\w{4,}\b', text.lower())
    keywords = list(set(words))
    return keywords[:10]  # Keep at most 10 unique keywords (arbitrary order) for simplicity
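
# Illustrative example (output order varies because set() is unordered):
#   extract_keywords("Oil prices climbed as OPEC production fell")
#   -> e.g. ['prices', 'climbed', 'opec', 'production', 'fell']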


def scrape_author_info(author_url, headline_pages=1):
    """Scrape an author's name, bio, contributor-since date, and latest
    article headlines with excerpts and keywords."""
    options = Options()
    # Selenium 4 removed the writable Options.headless attribute; pass the
    # headless flag as a browser argument instead.
    options.add_argument("--headless")
    driver = webdriver.Firefox(options=options)

    author_name = "Unknown"
    author_bio = ""
    contributor_since = ""
    other_articles = []

    try:
        # Load the author page and wait for the headline to render
        driver.get(author_url)
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )
        page_source = driver.page_source
        bio_soup = BeautifulSoup(page_source, "html.parser")

        # Extract author name
        author_name_tag = bio_soup.find('h1')
        author_name = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"

        # Extract author bio
        author_bio_tag = bio_soup.find('div', class_='biography')
        author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"

        # Extract contributor-since date
        contributor_since_tag = bio_soup.find('p', class_='contributor_since')
        contributor_since = (
            contributor_since_tag.get_text(strip=True).replace("Contributor since: ", "")
            if contributor_since_tag
            else "Unknown Date"
        )
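        # Illustrative: if the page shows "Contributor since: 12 Jan 2015", the
        # replace() above leaves "12 Jan 2015". Note the CSS classes used here
        # ('biography', 'contributor_since') are site-specific and will break
        # if oilprice.com changes its markup.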

        # Extract latest articles by author with heading, excerpt, and keywords
        for page in range(1, headline_pages + 1):
            driver.get(f"{author_url}/Page-{page}.html")
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "articles"))
            )
            page_soup = BeautifulSoup(driver.page_source, "html.parser")
            article_tags = page_soup.find_all('li', class_='clear')
            for article in article_tags:
                heading_tag = article.find('h3')
                excerpt_tag = article.find('p', class_='articlecontent')
                if heading_tag and excerpt_tag:
                    heading = heading_tag.get_text(strip=True)
                    excerpt = excerpt_tag.get_text(strip=True)
                    keywords = extract_keywords(excerpt)
                    other_articles.append({
                        "heading": heading,
                        "excerpt": excerpt,
                        "keywords": keywords
                    })
    except Exception as e:
        print(f"Error scraping author info: {e}")
        author_name = "Error Occurred"
        author_bio = str(e)
        contributor_since = "N/A"
        other_articles = [{"heading": "Error retrieving articles", "excerpt": "", "keywords": []}]
    finally:
        driver.quit()

    return {
        "name": author_name,
        "bio": author_bio,
        "contributor_since": contributor_since,
        "other_articles": other_articles
    }
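
# For reference, a successful run returns a dict shaped like this (values illustrative):
#   {
#       "name": "Charles Kennedy",
#       "bio": "...",
#       "contributor_since": "...",
#       "other_articles": [{"heading": "...", "excerpt": "...", "keywords": [...]}],
#   }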


def save_to_json(data, output_file):
    """Save author info to a JSON file."""
    with open(output_file, mode="w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=4)
    print(f"Author info saved to {output_file}")


if __name__ == "__main__":
    # Scrape author info
    author_info = scrape_author_info(AUTHOR_URL, headline_pages=1)
    # Save to JSON
    save_to_json(author_info, OUTPUT_FILE)
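
# To collect more headlines (the site is assumed to paginate author articles as
# /Page-N.html, matching the loop above), raise headline_pages, e.g.:
#   author_info = scrape_author_info(AUTHOR_URL, headline_pages=3)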