added logging for duplicate articles, added placeholder for sentiment analysis
This commit is contained in:
@@ -1 +1,21 @@
|
||||
|
||||
headline,link,date
|
||||
Hess Beats Q3 Earnings Estimates On Robust Guyana Output,https://oilprice.com/Latest-Energy-News/World-News/Hess-Beats-Q3-Earnings-Estimates-On-Robust-Guyana-Output.html,"Oct 30, 2024 at 13:13 | Alex Kimani"
|
||||
U.S. Governors Demand Power Price Overhaul As Costs Balloon 10 Fold,https://oilprice.com/Latest-Energy-News/World-News/US-Governors-Demand-Power-Price-Overhaul-As-Costs-Balloon-10-Fold.html,"Oct 30, 2024 at 12:10 | Julianne Geiger"
|
||||
Russia’s Gazprom Boosts 2024 Investments to $16.9 Billion,https://oilprice.com/Latest-Energy-News/World-News/Russias-Gazprom-Boosts-2024-Investments-to-169-Billion.html,"Oct 30, 2024 at 10:44 | Charles Kennedy"
|
||||
Investment Giants Form $50-Billion AI and Power Partnership,https://oilprice.com/Latest-Energy-News/World-News/Investment-Giants-Form-50-Billion-AI-and-Power-Partnership.html,"Oct 30, 2024 at 09:20 | Charles Kennedy"
|
||||
Vietnamese EV Maker Gets $1 Billion in Funding Led by UAE,https://oilprice.com/Latest-Energy-News/World-News/Vietnamese-EV-Maker-Gets-1-Billion-in-Funding-Led-by-UAE.html,"Oct 30, 2024 at 08:55 | Charles Kennedy"
|
||||
The West Needs Incentives to Cut Russian Nuclear Fuel Dependence,https://oilprice.com/Latest-Energy-News/World-News/The-West-Needs-Incentives-to-Cut-Russian-Nuclear-Fuel-Dependence.html,"Oct 30, 2024 at 08:03 | Tsvetana Paraskova"
|
||||
Gazprom Unit Sues Industrial Gases Giant Linde for $884 Million,https://oilprice.com/Latest-Energy-News/World-News/Gazprom-Unit-Sues-Industrial-Gases-Giant-Linde-for-884-Million.html,"Oct 30, 2024 at 07:33 | Tsvetana Paraskova"
|
||||
Chinese Oil Major to Explore Iraqi Field,https://oilprice.com/Latest-Energy-News/World-News/Chinese-Oil-Major-to-Explore-Iraqi-Field.html,"Oct 30, 2024 at 06:09 | Charles Kennedy"
|
||||
Oil Prices Remain Subdued on the Prospect of an Israel-Lebanon Ceasefire,https://oilprice.com/Latest-Energy-News/World-News/Oil-Prices-Remain-Subdued-on-Prospect-of-Israel-Lebanon-Ceasefire.html,"Oct 30, 2024 at 04:55 | Irina Slav"
|
||||
Ukraine and Russia Discuss Halting Attacks on Energy Sites,https://oilprice.com/Latest-Energy-News/World-News/Ukraine-and-Russia-Discuss-Halting-Attacks-on-Energy-Sites.html,"Oct 30, 2024 at 04:05 | Tsvetana Paraskova"
|
||||
Lukoil’s Trading Arm Looks to Revive U.S. Business,https://oilprice.com/Latest-Energy-News/World-News/Lukoils-Trading-Arm-Looks-to-Revive-US-Business.html,"Oct 30, 2024 at 03:08 | Tsvetana Paraskova"
|
||||
"Unexpected Crude, Product Draws Send Oil Prices Up",https://oilprice.com/Latest-Energy-News/World-News/Unexpected-Crude-Product-Draws-Send-Oil-Prices-Up.html,"Oct 29, 2024 at 15:51 | Julianne Geiger"
|
||||
"U.S. To Buy 3 Million Barrels for The SPR, But There’s A Problem",https://oilprice.com/Latest-Energy-News/World-News/US-To-Buy-3-Million-Barrels-for-The-SPR-But-Theres-A-Problem.html,"Oct 29, 2024 at 13:56 | Alex Kimani"
|
||||
"As Oil Job Losses Mount, Steelworkers Union Looks to Clean Energy",https://oilprice.com/Latest-Energy-News/World-News/As-Oil-Job-Losses-Mount-Steelworkers-Union-Looks-to-Clean-Energy.html,"Oct 29, 2024 at 13:05 | Alex Kimani"
|
||||
TotalEnergies to Produce More Gas Condensate Offshore Denmark,https://oilprice.com/Latest-Energy-News/World-News/TotalEnergies-to-Produce-More-Gas-Condensate-Offshore-Denmark.html,"Oct 29, 2024 at 10:59 | Charles Kennedy"
|
||||
Phillips 66 Beats Analyst Estimates Despite Earnings Dip in Q3,https://oilprice.com/Latest-Energy-News/World-News/Phillips-66-Beats-Analyst-Estimates-Despite-Earnings-Dip-in-Q3.html,"Oct 29, 2024 at 09:52 | Charles Kennedy"
|
||||
UK Offshore Oil Platform Halted Due to Gas Compressor Issue,https://oilprice.com/Latest-Energy-News/World-News/UK-Offshore-Oil-Platform-Halted-Due-to-Gas-Compressor-Issue.html,"Oct 29, 2024 at 09:12 | Charles Kennedy"
|
||||
Nigeria Discusses Crude and Fuel Supply with Africa’s Top Refinery,https://oilprice.com/Latest-Energy-News/World-News/Nigeria-Discusses-Crude-and-Fuel-Supply-with-Africas-Top-Refinery.html,"Oct 29, 2024 at 07:56 | Tsvetana Paraskova"
|
||||
Austria’s OMV Profit Slumps on Weak Oil Trading and Refining,https://oilprice.com/Latest-Energy-News/World-News/Austrias-OMV-Profit-Slumps-on-Weak-Oil-Trading-and-Refining.html,"Oct 29, 2024 at 07:06 | Tsvetana Paraskova"
|
||||
BP Earnings Top Forecasts Despite Weaker Oil Prices and Refining,https://oilprice.com/Latest-Energy-News/World-News/BP-Earnings-Top-Forecasts-Despite-Weaker-Oil-Prices-and-Refining.html,"Oct 29, 2024 at 06:03 | Tsvetana Paraskova"
|
||||
|
||||
|
4002
Data-Collection/WebScraper/data/oil_news.json
Normal file
4002
Data-Collection/WebScraper/data/oil_news.json
Normal file
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@@ -26,7 +26,13 @@ def save_to_json(data, file_path):
|
||||
existing_data = load_existing_data(file_path)
|
||||
existing_links = {article['link'] for article in existing_data}
|
||||
|
||||
new_data = [article for article in data if article['link'] not in existing_links]
|
||||
new_data = []
|
||||
for article in data:
|
||||
if article['link'] in existing_links:
|
||||
print(f"Skipping duplicate article: {article['headline']}")
|
||||
continue
|
||||
new_data.append(article)
|
||||
|
||||
combined_data = existing_data + new_data
|
||||
|
||||
with open(file_path, 'w', encoding='utf-8') as f:
|
||||
@@ -34,9 +40,20 @@ def save_to_json(data, file_path):
|
||||
print(f"Oil news data saved to {file_path}")
|
||||
|
||||
def extract_keywords(text):
    """Improved placeholder function to extract keywords from text.

    The text is lower-cased and split into ``\\w+`` words; words of three
    characters or fewer are dropped as a crude stop-word filter.

    Args:
        text: The string to scan (e.g. a headline plus excerpt).

    Returns:
        A list of up to 10 distinct keywords, in order of first appearance.
    """
    words = re.findall(r'\b\w+\b', text.lower())
    # dict.fromkeys de-duplicates while keeping first-seen order; slicing a
    # set instead would return an arbitrary subset that can differ per run.
    unique_keywords = dict.fromkeys(word for word in words if len(word) > 3)
    return list(unique_keywords)[:10]  # Return up to 10 unique keywords
|
||||
|
||||
def analyze_sentiment(text):
    """Placeholder function for sentiment analysis.

    Basic cue-word logic (to be replaced with actual sentiment analysis).

    Args:
        text: Headline and/or excerpt text. ``None`` or an empty string is
            treated as carrying no signal.

    Returns:
        "Positive" if any word starts with "profit" or "rise"; otherwise
        "Negative" if any word starts with "loss" or "decline"; otherwise
        "Neutral". Positive cues take precedence when both kinds appear.
    """
    if not text:
        return "Neutral"

    # The scraped headlines are title-cased, so compare case-insensitively.
    # Punctuation is turned into spaces so each word can be tested on its own.
    normalized = "".join(ch if ch.isalnum() else " " for ch in text.lower())
    words = normalized.split()

    # Match on word starts rather than raw substrings: "profits"/"declined"
    # still count, but "surprise" or "enterprise" no longer trigger "rise".
    if any(word.startswith(("profit", "rise")) for word in words):
        return "Positive"
    if any(word.startswith(("loss", "decline")) for word in words):
        return "Negative"
    return "Neutral"
|
||||
|
||||
def scrape_oil_news():
|
||||
print("Scraping oil market news using Selenium...")
|
||||
@@ -50,7 +67,6 @@ def scrape_oil_news():
|
||||
max_pages = 10 # Limit to 10 pages
|
||||
|
||||
while page_number <= max_pages:
|
||||
# Load the page with pagination
|
||||
driver.get(f"{OIL_NEWS_URL}Page-{page_number}.html")
|
||||
|
||||
try:
|
||||
@@ -84,7 +100,7 @@ def scrape_oil_news():
|
||||
'author': author,
|
||||
'excerpt': excerpt,
|
||||
'keywords': extract_keywords(headline + " " + excerpt if excerpt else headline),
|
||||
'sentiment_analysis': None # Placeholder for future sentiment analysis
|
||||
'sentiment_analysis': analyze_sentiment(headline + " " + excerpt if excerpt else headline)
|
||||
})
|
||||
|
||||
page_number += 1
|
||||
|
||||
Reference in New Issue
Block a user