From 7f47d4483d563162eec46d14a68f84903dd32de3 Mon Sep 17 00:00:00 2001 From: klein panic Date: Tue, 22 Oct 2024 14:47:39 -0400 Subject: [PATCH] Added data-collection --- Data-Collection/.gitignore | 1 + Data-Collection/WebScraper/README.md | 0 Data-Collection/WebScraper/data/oil_news.csv | 1 + Data-Collection/WebScraper/main.py | 14 ++++ .../oil_news_scraper.cpython-311.pyc | Bin 0 -> 2897 bytes .../WebScraper/scrapers/oil_news_scraper.py | 66 ++++++++++++++++++ Data-Collection/WebScraper/setup.py | 0 7 files changed, 82 insertions(+) create mode 100644 Data-Collection/.gitignore create mode 100644 Data-Collection/WebScraper/README.md create mode 100644 Data-Collection/WebScraper/data/oil_news.csv create mode 100644 Data-Collection/WebScraper/main.py create mode 100644 Data-Collection/WebScraper/scrapers/__pycache__/oil_news_scraper.cpython-311.pyc create mode 100644 Data-Collection/WebScraper/scrapers/oil_news_scraper.py create mode 100644 Data-Collection/WebScraper/setup.py diff --git a/Data-Collection/.gitignore b/Data-Collection/.gitignore new file mode 100644 index 0000000..f7275bb --- /dev/null +++ b/Data-Collection/.gitignore @@ -0,0 +1 @@ +venv/ diff --git a/Data-Collection/WebScraper/README.md b/Data-Collection/WebScraper/README.md new file mode 100644 index 0000000..e69de29 diff --git a/Data-Collection/WebScraper/data/oil_news.csv b/Data-Collection/WebScraper/data/oil_news.csv new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/Data-Collection/WebScraper/data/oil_news.csv @@ -0,0 +1 @@ + diff --git a/Data-Collection/WebScraper/main.py b/Data-Collection/WebScraper/main.py new file mode 100644 index 0000000..cb5ad05 --- /dev/null +++ b/Data-Collection/WebScraper/main.py @@ -0,0 +1,14 @@ +# main.py +import scrapers.oil_news_scraper as oil_news + +def main(): + """Run the oil news scraper, printing a start and a completion message.""" + print("Starting oil data collection...") + + oil_news.run_scraper() + + print("Oil news data scraping completed.") + +if __name__ == "__main__": + main() +
diff --git a/Data-Collection/WebScraper/scrapers/__pycache__/oil_news_scraper.cpython-311.pyc b/Data-Collection/WebScraper/scrapers/__pycache__/oil_news_scraper.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cfde48a9da90ad7549cc6af768035330d1ba6329 GIT binary patch literal 2897 zcmZ3^%ge>Uz`*eRgLt|P2Lr=n5C?`?pp4J=7#J9)Go&!2Fy=5sfoP^2#$2W-CPomO zF^4&d1uVxJ#hSt##g@vN#SYcUuz&-`LZ+D*QdzS&VIs>I7#LQ=I1DK)%a|A#Rx`l_ zqPS4Z)0FF}g^ zG#PL4I;AF-mSm=t<^<=L7E~E$l#~<{Tj}fPXXX?XWhSTUCFker`y`g67MJL{=A{;; zSL*tumKW=X=NILqK$y2!QW8rNlR+lJFepTr85kHo?_mOkWeMD^3=AoZsIF&ZC;_R2 z>dbAx=52)l?&i$1>^!4PGMiefoftN zV+unpE3u{)NfWCwg;Nq{DFfIJHk>-SQn=AWr%0O=)g<|qy@s_&p@ywUu79%p8T>#G>rf5{10f@?t$by(;dElH44&V$ zQq%K`DjkbTGLv&sH5qTQCFdj-7suaXObpRvyv15vQj}S6izTBdHSHElaY16+*m9#L^DT~y)WnpW%)HcFEIFBZ+29PAs>yqcwV)_7ujCd-QEFjnYH>;N zE#~ypl3P6fo<8w@uHnJ)p+P>k1d0+fi&Nv%@{8h&OA<>;i;FSXjxC&XQd{W6zltDrX&`Jq$X$N<>%z5XQmeGgIuEPoS&1Enp~2Z zpQj(5ngoum)FS<2P^_gE6@!Yhc(7BTB675_4o%@LuFLyTWbO;PQcuK|rLxwySnV=ml+?j@pa-wpaLV8$3R+F$fAb zcz$4G5Rv)7z{AQ9COX(3*t+bG*%fm_`J%1g6B|Wzw>o|pG*8c7lOkt@cUfkkGR4gae+VL0|P@iQ#4Zt%Lg_F z9+3|WtepHaSisZ*2=4}u=ye{MOFS|+WK?cQn64ceT@cazQxhORas2%_%;Lq>CwKj51 zgId$pGL*pU76t}HJ+lC$8)PgBEvwZ153bq$;D-Epu#0HFD132 zO58s)2V6!efa+$2;>5Dl6ory}g(_{oe27qaVzB}w6Qt-Vgk+>DI0uI*q-Ex$Dr6Qb zq~;csRO)GRfV0Idru^btECq=r8Mj!n@-y>pakw~!IL5np2Hj!>E4;;4k{_R3Tvi0i zK$=Xq*ul9iCG8d`$h>%v&SFs5D<~8xgK|B0QE6U0#O|UZ0|o{LGf>8>U;t&jr+l&( z`Q)$g$v3!v1!a>d?9+KB@yv}_PlVCX4G()uEY%@q!t3oulq%D}*& z$zLP{avpP1u?eJ5WGhI_OGzv)k_Hu68X!UhL}-EtEs#>S^wg5%@{}TNFekMl6IAGO zH@&b$G2R0=} Uz7Gs=!i$lI(e?uaCIR*k0JJWg;Q#;t literal 0 HcmV?d00001 diff --git a/Data-Collection/WebScraper/scrapers/oil_news_scraper.py b/Data-Collection/WebScraper/scrapers/oil_news_scraper.py new file mode 100644 index 0000000..e0f4f31 --- /dev/null +++ b/Data-Collection/WebScraper/scrapers/oil_news_scraper.py @@ -0,0 +1,66 @@ +# scrapers/oil_news_scraper.py + +import requests +from bs4 import BeautifulSoup +import pandas as pd +import 
os
+
+# Listing page for OilPrice.com world energy news
+OIL_NEWS_URL = "https://oilprice.com/Latest-Energy-News/World-News/"
+REQUEST_TIMEOUT = 30  # seconds
+
+# Output directory, resolved against the working directory at import time
+DATA_DIR = os.path.join(os.getcwd(), "data")
+os.makedirs(DATA_DIR, exist_ok=True)  # no exists()/makedirs() race
+
+
+def scrape_oil_news():
+    """Fetch OIL_NEWS_URL and return its articles as a DataFrame.
+
+    Returns a pandas DataFrame with 'headline', 'link' and 'date' columns,
+    one row per 'categoryArticle' container that has all three values.
+    Raises requests.RequestException on network, timeout or HTTP errors.
+    """
+    from urllib.parse import urljoin  # local, so the import block stays as-is
+
+    print("Scraping oil market news...")
+    # A timeout keeps a stalled connection from hanging the run forever
+    response = requests.get(OIL_NEWS_URL, timeout=REQUEST_TIMEOUT)
+    response.raise_for_status()
+    soup = BeautifulSoup(response.text, "html.parser")
+
+    news_data = []
+    for article in soup.find_all('div', class_='categoryArticle'):
+        anchor = article.find('a')
+        date_tag = article.find('span', class_='categoryArticle__date')
+        if anchor is None or date_tag is None:
+            continue
+        headline = anchor.get_text(strip=True)
+        link = anchor.get('href')  # an <a> without href must not raise
+        date = date_tag.get_text(strip=True)
+        # Skip containers where any of the three fields is empty
+        if headline and link and date:
+            news_data.append({
+                'headline': headline,
+                # urljoin keeps absolute hrefs and resolves relative ones
+                'link': urljoin(OIL_NEWS_URL, link),
+                'date': date,
+            })
+
+    return pd.DataFrame(news_data, columns=['headline', 'link', 'date'])
+
+
+def run_scraper():
+    """Scrape the news and write it to DATA_DIR/oil_news.csv.
+
+    Nothing is written when no articles were scraped, so an existing CSV
+    is left untouched.
+    """
+    news_df = scrape_oil_news()
+    file_path = os.path.join(DATA_DIR, 'oil_news.csv')
+
+    if news_df.empty:
+        print("No data was scraped. The CSV file was not updated.")
+        return
+    news_df.to_csv(file_path, index=False)
+    print(f"Oil news data saved to {file_path}")
diff --git a/Data-Collection/WebScraper/setup.py b/Data-Collection/WebScraper/setup.py
new file mode 100644
index 0000000..e69de29