Add debug print of fetched HTML to oil news scraper; add Selenium Firefox WebDriver test script

2024-10-27 18:03:01 -04:00
parent 8d9ed2e7ea
commit a81e38e21f
5 changed files with 103 additions and 28 deletions
--- a/Data-Collection/WebScraper/scrapers/pycache/oil_news_scraper.cpython-311.pyc
+++ b/Data-Collection/WebScraper/scrapers/pycache/oil_news_scraper.cpython-311.pyc
--- a/Data-Collection/WebScraper/scrapers/oil_news_scraper.py
+++ b/Data-Collection/WebScraper/scrapers/oil_news_scraper.py
@@ -21,6 +21,9 @@ def scrape_oil_news():
    response = requests.get(OIL_NEWS_URL)
    response.raise_for_status()

+    # Print the HTML to see what we are working with
+    print(response.text[:1000])  # Print only the first 1000 characters for brevity
+
    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")

@@ -45,9 +48,7 @@ def scrape_oil_news():
                'date': date
            })

-    # Convert the list into a pandas DataFrame
    df = pd.DataFrame(news_data)
-
    return df

 # Function to run the scraper and save data
--- a/Data-Collection/WebScraper/scrapers/tests/selenium_webdriver_test.py
+++ b/Data-Collection/WebScraper/scrapers/tests/selenium_webdriver_test.py
@@ -0,0 +1,26 @@
+from selenium import webdriver
+from selenium.webdriver.firefox.service import Service
+from selenium.webdriver.common.by import By
+import time
+
+# Manual check that Selenium can drive Firefox and read OilPrice.com headlines.
+# Provide the path to your geckodriver executable using the Service class
+service = Service(executable_path='/usr/local/bin/geckodriver')
+driver = webdriver.Firefox(service=service)
+
+# try/finally: close the browser even if a step below raises (e.g. no <a> found)
+try:
+    driver.get("https://oilprice.com/Latest-Energy-News/World-News/")
+
+    time.sleep(5)  # fixed pause to give the page time to load
+
+    # Print the title of the page to verify that it's loaded
+    print(driver.title)
+
+    # Print the text of the first link inside each article block
+    articles = driver.find_elements(By.CSS_SELECTOR, "div.categoryArticle")
+    for article in articles:
+        title = article.find_element(By.TAG_NAME, "a").text
+        print(f"Article title: {title}")
+finally:
+    driver.quit()