Add RSS feed support to BalanceBoard
- Implement RSS/Atom feed parser using feedparser library
- Add RSS platform configuration with sample feeds (HN RSS, Lobsters RSS, Reddit RSS)
- Support both RSS 2.0 and Atom formats with automatic detection
- Extract and normalize: title, author, link, content, tags, timestamps
- HTML entity unescaping and tag stripping for clean content
- Fallback handling for missing fields
- Users can add any RSS feed URL as a collection source

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
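For context, the parsing and normalization the bullets above describe sit on top of feedparser's dict-style entries. Below is a minimal, standalone sketch of that flow; the feed URL is a placeholder (not one of the sample feeds), and it assumes feedparser is installed.

# Minimal sketch of the parse/normalize flow, independent of BalanceBoard.
# The URL is a placeholder; feedparser exposes the same entry fields for RSS 2.0 and Atom.
import re
import time
from html import unescape

import feedparser

feed = feedparser.parse("https://example.com/feed.xml")
for entry in feed.entries[:5]:
    title = unescape(entry.get("title", "Untitled"))
    link = entry.get("link", "")
    summary = re.sub(r"<[^>]+>", "", unescape(entry.get("summary", "")))
    published = entry.get("published_parsed")           # struct_time or None
    timestamp = int(time.mktime(published)) if published else 0
    print(timestamp, title, link, summary[:80])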
@@ -80,6 +80,8 @@ class data_methods():
            return data_methods.fetchers.getLobstersData(start_date, end_date, community, max_posts)
        elif platform == "stackexchange":
            return data_methods.fetchers.getStackExchangeData(start_date, end_date, community, max_posts)
        elif platform == "rss":
            return data_methods.fetchers.getRSSData(start_date, end_date, community, max_posts)
        else:
            print("dataGrab.getData: platform not recognized")
            return None
@@ -263,6 +265,11 @@ class data_methods():
                'order': 'desc'
            }

        @staticmethod
        def build_rss_url(feed_url):
            """RSS feeds use the URL directly as provided in config"""
            return feed_url

    # ===== SCHEMA CONVERTERS =====
    class converters():
        """Functions to convert platform-specific data to unified schema"""
@@ -340,6 +347,81 @@ class data_methods():
                'meta': {'view_count': raw.get('view_count', 0)}
            }

        @staticmethod
        def rss_to_schema(entry, feed_url):
            """
            Convert RSS/Atom feed entry to unified schema.
            Supports both RSS 2.0 and Atom formats via feedparser.
            """
            import hashlib
            from html import unescape

            # Extract link (RSS uses 'link', Atom may use 'links')
            link = entry.get('link', '')
            if not link and 'links' in entry and len(entry['links']) > 0:
                link = entry['links'][0].get('href', '')

            # Generate ID from link or guid
            entry_id = entry.get('id', entry.get('guid', link))
            if not entry_id:
                # Fallback: hash of title + link
                entry_id = hashlib.md5(f"{entry.get('title', '')}{link}".encode()).hexdigest()

            # Clean up ID to make it URL-safe
            safe_id = entry_id.replace('/', '_').replace(':', '_')[:100]

            # Extract timestamp
            timestamp = 0
            if 'published_parsed' in entry and entry['published_parsed']:
                import time
                timestamp = int(time.mktime(entry['published_parsed']))
            elif 'updated_parsed' in entry and entry['updated_parsed']:
                import time
                timestamp = int(time.mktime(entry['updated_parsed']))

            # Extract author
            author = 'unknown'
            if 'author' in entry:
                author = entry['author']
            elif 'author_detail' in entry:
                author = entry['author_detail'].get('name', 'unknown')

            # Extract content (try summary, then description, then content)
            content = ''
            if 'summary' in entry:
                content = unescape(entry['summary'])
            elif 'description' in entry:
                content = unescape(entry['description'])
            elif 'content' in entry and len(entry['content']) > 0:
                content = unescape(entry['content'][0].get('value', ''))

            # Strip HTML tags for cleaner content
            import re
            content = re.sub(r'<[^>]+>', '', content)

            # Extract tags/categories
            tags = []
            if 'tags' in entry:
                tags = [tag.get('term', '') for tag in entry['tags']]

            return {
                'platform': 'rss',
                'id': f"rss_{safe_id}",
                'title': unescape(entry.get('title', 'Untitled')),
                'author': author,
                'timestamp': timestamp,
                'score': 0,  # RSS doesn't have scores
                'replies': 0,  # RSS doesn't track comments
                'url': link,
                'content': content[:1000],  # Limit content length
                'source': feed_url,
                'tags': tags,
                'meta': {
                    'feed_url': feed_url,
                    'guid': entry.get('guid', '')
                }
            }

    # ===== COMMENT FETCHERS =====
    class comment_fetchers():
        """Functions to fetch comments for posts from various platforms"""
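For orientation between hunks: a hedged one-off use of the rss_to_schema converter added above. The feed URL is a placeholder, and it assumes data_methods (from the diff) is already in scope along with an installed feedparser.

# Hypothetical single-entry conversion; feed URL is a placeholder.
import feedparser

url = "https://example.com/rss.xml"
feed = feedparser.parse(url)
if feed.entries:
    post = data_methods.converters.rss_to_schema(feed.entries[0], url)
    # Entries without an id/guid fall back to an md5 of title + link,
    # and a missing author comes back as 'unknown'.
    print(post["id"], post["author"], post["tags"])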
@@ -621,3 +703,42 @@ class data_methods():
            # Fetch and convert
            raw = data_methods.utils.http_get_json(url, params=params)
            return [data_methods.converters.stackexchange_to_schema(q, community) for q in raw.get('items', [])]

        @staticmethod
        def getRSSData(start_date, end_date, feed_url, max_posts):
            """
            Fetch and parse RSS/Atom feeds.
            Requires feedparser library: pip install feedparser
            """
            try:
                import feedparser
            except ImportError:
                print("Error: feedparser not installed. Run: pip install feedparser")
                return []

            try:
                # Fetch and parse RSS feed
                feed = feedparser.parse(feed_url)

                # Check for errors
                if hasattr(feed, 'bozo') and feed.bozo:
                    print(f"Warning: RSS feed may have issues: {feed.get('bozo_exception', 'Unknown error')}")

                # Extract entries
                entries = feed.get('entries', [])[:max_posts]

                if not entries:
                    print(f"No entries found in RSS feed: {feed_url}")
                    return []

                # Convert entries to unified schema
                posts = [data_methods.converters.rss_to_schema(entry, feed_url) for entry in entries]

                # Filter by date range
                return data_methods.utils.filter_by_date_range(posts, start_date, end_date)

            except Exception as e:
                print(f"Error fetching RSS feed {feed_url}: {e}")
                import traceback
                traceback.print_exc()
                return []
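For reference, a hedged end-to-end sketch of calling the new fetcher. The import path, dates, and feed URL are placeholders; the date format is assumed to match whatever filter_by_date_range already accepts for the other platforms.

# Hypothetical call to the new fetcher; argument values are placeholders.
from dataGrab import data_methods  # assumed import path, based on the "dataGrab.getData" error string

posts = data_methods.fetchers.getRSSData(
    start_date="2024-01-01",   # date format assumed to match the other fetchers
    end_date="2024-01-31",
    feed_url="https://example.com/atom.xml",
    max_posts=50,
)
for post in posts:
    print(post["timestamp"], post["title"], post["url"])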