MidasEngine/Data-Collection/WebScraper/scrapers/tests/author_scraper_test.py.bak

import json
import re

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

AUTHOR_URL = "https://oilprice.com/contributors/Charles-Kennedy"  # Replace with the actual author URL
OUTPUT_FILE = "author_info.json"


def extract_keywords(text):
    """Basic keyword extraction: collect unique words longer than 3 characters."""
    words = re.findall(r'\b\w{4,}\b', text.lower())
    keywords = list(set(words))
    return keywords[:10]  # Keep at most 10 unique keywords (arbitrary order) for simplicity
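
# Illustrative example (output order varies because set() is unordered):
#   extract_keywords("Oil prices climbed as OPEC production fell")
#   -> e.g. ['prices', 'climbed', 'opec', 'production', 'fell']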


def scrape_author_info(author_url, headline_pages=1):
    """Scrape an author's name, bio, contributor-since date, and latest
    article headlines with excerpts and keywords."""
    options = Options()
    # Selenium 4 removed the writable Options.headless attribute; pass the
    # headless flag as a browser argument instead.
    options.add_argument("--headless")
    driver = webdriver.Firefox(options=options)

    author_name = "Unknown"
    author_bio = ""
    contributor_since = ""
    other_articles = []

    try:
        # Load the author page and wait for the headline to render
        driver.get(author_url)
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )
        page_source = driver.page_source
        bio_soup = BeautifulSoup(page_source, "html.parser")

        # Extract author name
        author_name_tag = bio_soup.find('h1')
        author_name = author_name_tag.get_text(strip=True) if author_name_tag else "Unknown Author"

        # Extract author bio
        author_bio_tag = bio_soup.find('div', class_='biography')
        author_bio = author_bio_tag.get_text(strip=True) if author_bio_tag else "No bio available"

        # Extract contributor-since date
        contributor_since_tag = bio_soup.find('p', class_='contributor_since')
        contributor_since = (
            contributor_since_tag.get_text(strip=True).replace("Contributor since: ", "")
            if contributor_since_tag
            else "Unknown Date"
        )
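        # Illustrative: if the page shows "Contributor since: 12 Jan 2015", the
        # replace() above leaves "12 Jan 2015". Note the CSS classes used here
        # ('biography', 'contributor_since') are site-specific and will break
        # if oilprice.com changes its markup.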

        # Extract latest articles by author with heading, excerpt, and keywords
        for page in range(1, headline_pages + 1):
            driver.get(f"{author_url}/Page-{page}.html")
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "articles"))
            )
            page_soup = BeautifulSoup(driver.page_source, "html.parser")
            article_tags = page_soup.find_all('li', class_='clear')
            for article in article_tags:
                heading_tag = article.find('h3')
                excerpt_tag = article.find('p', class_='articlecontent')
                if heading_tag and excerpt_tag:
                    heading = heading_tag.get_text(strip=True)
                    excerpt = excerpt_tag.get_text(strip=True)
                    keywords = extract_keywords(excerpt)
                    other_articles.append({
                        "heading": heading,
                        "excerpt": excerpt,
                        "keywords": keywords
                    })
    except Exception as e:
        print(f"Error scraping author info: {e}")
        author_name = "Error Occurred"
        author_bio = str(e)
        contributor_since = "N/A"
        other_articles = [{"heading": "Error retrieving articles", "excerpt": "", "keywords": []}]
    finally:
        driver.quit()

    return {
        "name": author_name,
        "bio": author_bio,
        "contributor_since": contributor_since,
        "other_articles": other_articles
    }
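
# For reference, a successful run returns a dict shaped like this (values illustrative):
#   {
#       "name": "Charles Kennedy",
#       "bio": "...",
#       "contributor_since": "...",
#       "other_articles": [{"heading": "...", "excerpt": "...", "keywords": [...]}],
#   }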


def save_to_json(data, output_file):
    """Save author info to a JSON file."""
    with open(output_file, mode="w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=4)
    print(f"Author info saved to {output_file}")


if __name__ == "__main__":
    # Scrape author info
    author_info = scrape_author_info(AUTHOR_URL, headline_pages=1)
    # Save to JSON
    save_to_json(author_info, OUTPUT_FILE)
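
# To collect more headlines (the site is assumed to paginate author articles as
# /Page-N.html, matching the loop above), raise headline_pages, e.g.:
#   author_info = scrape_author_info(AUTHOR_URL, headline_pages=3)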