added author test, about to integrate into preprocessor.py
@@ -0,0 +1,106 @@
import json
import re

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

AUTHOR_URL = "https://oilprice.com/contributors/Charles-Kennedy"  # Replace with actual author URL
OUTPUT_FILE = "author_info.json"

def extract_keywords(text):
    """Basic keyword extraction: unique words longer than 3 characters, in order of first appearance."""
    words = re.findall(r'\b\w{4,}\b', text.lower())
    keywords = list(dict.fromkeys(words))  # dedupe while keeping first-seen order (set() would be unordered)
    return keywords[:10]  # cap at 10 keywords for simplicity
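As a quick sanity check of the helper (illustrative input, not from this commit; output order follows first appearance in the text):

>>> extract_keywords("Oil prices rose as OPEC cut production")
['prices', 'rose', 'opec', 'production']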

def scrape_author_info(author_url, headline_pages=1):
    """Scrape author's name, bio, contributor since date, and latest article headlines with excerpts and keywords."""
    options = Options()
    options.add_argument("--headless")  # Options.headless is deprecated/removed in Selenium 4; pass the flag instead
    driver = webdriver.Firefox(options=options)

    author_name = "Unknown"
    author_bio = ""
    contributor_since = ""
    other_articles = []

    try:
        # Load author page
        driver.get(author_url)
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )
        page_source = driver.page_source
        bio_soup = BeautifulSoup(page_source, "html.parser")

        # Extract author name
        author_name_tag = bio_soup.find('h1')
        author_name = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"

        # Extract author bio
        author_bio_tag = bio_soup.find('div', class_='biography')
        author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"

        # Extract contributor since date
        contributor_since_tag = bio_soup.find('p', class_='contributor_since')
        contributor_since = contributor_since_tag.get_text(strip=True).replace("Contributor since: ", "") if contributor_since_tag else "Unknown Date"

        # Extract latest articles by author with heading, excerpt, and keywords
        for page in range(1, headline_pages + 1):
            driver.get(f"{author_url}/Page-{page}.html")
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "articles"))
            )
            page_soup = BeautifulSoup(driver.page_source, "html.parser")
            article_tags = page_soup.find_all('li', class_='clear')

            for article in article_tags:
                heading_tag = article.find('h3')
                excerpt_tag = article.find('p', class_='articlecontent')

                # Skip list items that are not full article entries
                if heading_tag and excerpt_tag:
                    heading = heading_tag.get_text(strip=True)
                    excerpt = excerpt_tag.get_text(strip=True)
                    keywords = extract_keywords(excerpt)

                    other_articles.append({
                        "heading": heading,
                        "excerpt": excerpt,
                        "keywords": keywords
                    })

    except Exception as e:
        print(f"Error scraping author info: {e}")
        author_name = "Error Occurred"
        author_bio = str(e)
        contributor_since = "N/A"
        other_articles = [{"heading": "Error retrieving articles", "excerpt": "", "keywords": []}]

    finally:
        driver.quit()

    return {
        "name": author_name,
        "bio": author_bio,
        "contributor_since": contributor_since,
        "other_articles": other_articles
    }

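For reference, the returned dict (and hence the JSON written below) has this shape on success; the field values here are placeholders, not real scraped data:

{
    "name": "<author name>",
    "bio": "<bio text>",
    "contributor_since": "<date string>",
    "other_articles": [
        {"heading": "<headline>", "excerpt": "<excerpt>", "keywords": ["<word>", "..."]}
    ]
}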
def save_to_json(data, output_file):
    """Save author info to a JSON file."""
    with open(output_file, mode="w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

    print(f"Author info saved to {output_file}")


if __name__ == "__main__":
    # Scrape author info
    author_info = scrape_author_info(AUTHOR_URL, headline_pages=1)

    # Save to JSON
    save_to_json(author_info, OUTPUT_FILE)
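The commit message says this is about to be integrated into preprocessor.py. A minimal sketch of what that call site might look like, assuming the file above is importable as author_scraper and that preprocessor.py only needs the returned dict; the module and function names below are hypothetical, not from this commit:

# preprocessor.py (hypothetical integration sketch)
from author_scraper import scrape_author_info

def load_author_metadata(author_url):
    # Reuse the scraper directly and skip the JSON round-trip
    return scrape_author_info(author_url, headline_pages=1)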