Fix scraper issues: add debug output of the fetched HTML and a Selenium test script

This commit is contained in:
klein panic
2024-10-27 18:03:01 -04:00
parent 8d9ed2e7ea
commit a81e38e21f
5 changed files with 103 additions and 28 deletions

View File

@@ -21,6 +21,9 @@ def scrape_oil_news():
response = requests.get(OIL_NEWS_URL)
response.raise_for_status()
# Print the HTML to see what we are working with
print(response.text[:1000]) # Print only the first 1000 characters for brevity
# Parse the HTML using BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")
@@ -45,9 +48,7 @@ def scrape_oil_news():
'date': date
})
# Convert the list into a pandas DataFrame
df = pd.DataFrame(news_data)
return df
# Function to run the scraper and save data

View File

@@ -0,0 +1,26 @@
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
import time
# Selenium smoke test: launch Firefox through geckodriver, load the OilPrice
# world-news listing, and print the page title plus the link text of each
# article container found on the page.

# Provide the path to your geckodriver executable using the Service class
service = Service(executable_path='/usr/local/bin/geckodriver')
driver = webdriver.Firefox(service=service)

# Everything after the browser starts runs inside try/finally so the Firefox
# and geckodriver processes are shut down even if navigation or scraping fails.
try:
    # Open a website (e.g., OilPrice.com)
    driver.get("https://oilprice.com/Latest-Energy-News/World-News/")

    # Wait for the page to load
    # NOTE(review): a fixed sleep is a blunt wait; an explicit wait on the
    # article selector would be more reliable but needs an extra import.
    time.sleep(5)

    # Print the title of the page to verify that it's loaded
    print(driver.title)

    # Find and print some element on the page, e.g., all article titles
    articles = driver.find_elements(By.CSS_SELECTOR, "div.categoryArticle")
    for article in articles:
        # find_elements returns an empty list when nothing matches, so a
        # container without a link is skipped instead of aborting the run
        # with NoSuchElementException.
        links = article.find_elements(By.TAG_NAME, "a")
        if not links:
            continue
        title = links[0].text
        print(f"Article title: {title}")
finally:
    # Close the browser
    driver.quit()