Add RSS feed support to BalanceBoard

- Implement RSS/Atom feed parser using feedparser library
- Add RSS platform configuration with sample feeds (HN RSS, Lobsters RSS, Reddit RSS)
- Support both RSS 2.0 and Atom formats with automatic detection
- Extract and normalize: title, author, link, content, tags, timestamps
- HTML entity unescaping and tag stripping for clean content
- Fallback handling for missing fields
- Users can add any RSS feed URL as a collection source

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-11 17:20:52 -05:00
parent e821a26b48
commit 47cca9d45e
3 changed files with 384 additions and 0 deletions
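
The commit message above mentions an RSS platform configuration seeded with sample feeds (HN, Lobsters, Reddit), presumably one of the three changed files, but it is not shown in the diff below. The following is only a rough sketch under assumed names: the mapping name, file layout, and exact sample URLs are guesses, not the committed configuration.

# Hypothetical sketch of the RSS source configuration; the real file's
# name, structure, and exact feed list are not visible in this diff.
SAMPLE_RSS_FEEDS = {
    "hackernews": "https://news.ycombinator.com/rss",         # HN front-page RSS
    "lobsters": "https://lobste.rs/rss",                       # Lobsters newest stories
    "reddit": "https://www.reddit.com/r/programming/.rss",     # any subreddit's feed
}

Because build_rss_url (below) returns the configured URL unchanged, any feed URL a user adds as a collection source works the same way as these samples.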


@@ -80,6 +80,8 @@ class data_methods():
            return data_methods.fetchers.getLobstersData(start_date, end_date, community, max_posts)
        elif platform == "stackexchange":
            return data_methods.fetchers.getStackExchangeData(start_date, end_date, community, max_posts)
        elif platform == "rss":
            return data_methods.fetchers.getRSSData(start_date, end_date, community, max_posts)
        else:
            print("dataGrab.getData: platform not recognized")
            return None
@@ -263,6 +265,11 @@ class data_methods():
                'order': 'desc'
            }

        @staticmethod
        def build_rss_url(feed_url):
            """RSS feeds use the URL directly as provided in config"""
            return feed_url

    # ===== SCHEMA CONVERTERS =====
    class converters():
        """Functions to convert platform-specific data to unified schema"""
@@ -340,6 +347,81 @@ class data_methods():
                'meta': {'view_count': raw.get('view_count', 0)}
            }

        @staticmethod
        def rss_to_schema(entry, feed_url):
            """
            Convert RSS/Atom feed entry to unified schema.
            Supports both RSS 2.0 and Atom formats via feedparser.
            """
            import hashlib
            from html import unescape

            # Extract link (RSS uses 'link', Atom may use 'links')
            link = entry.get('link', '')
            if not link and 'links' in entry and len(entry['links']) > 0:
                link = entry['links'][0].get('href', '')

            # Generate ID from link or guid
            entry_id = entry.get('id', entry.get('guid', link))
            if not entry_id:
                # Fallback: hash of title + link
                entry_id = hashlib.md5(f"{entry.get('title', '')}{link}".encode()).hexdigest()

            # Clean up ID to make it URL-safe
            safe_id = entry_id.replace('/', '_').replace(':', '_')[:100]

            # Extract timestamp
            timestamp = 0
            if 'published_parsed' in entry and entry['published_parsed']:
                import time
                timestamp = int(time.mktime(entry['published_parsed']))
            elif 'updated_parsed' in entry and entry['updated_parsed']:
                import time
                timestamp = int(time.mktime(entry['updated_parsed']))

            # Extract author
            author = 'unknown'
            if 'author' in entry:
                author = entry['author']
            elif 'author_detail' in entry:
                author = entry['author_detail'].get('name', 'unknown')

            # Extract content (try summary, then description, then content)
            content = ''
            if 'summary' in entry:
                content = unescape(entry['summary'])
            elif 'description' in entry:
                content = unescape(entry['description'])
            elif 'content' in entry and len(entry['content']) > 0:
                content = unescape(entry['content'][0].get('value', ''))

            # Strip HTML tags for cleaner content
            import re
            content = re.sub(r'<[^>]+>', '', content)

            # Extract tags/categories
            tags = []
            if 'tags' in entry:
                tags = [tag.get('term', '') for tag in entry['tags']]

            return {
                'platform': 'rss',
                'id': f"rss_{safe_id}",
                'title': unescape(entry.get('title', 'Untitled')),
                'author': author,
                'timestamp': timestamp,
                'score': 0,  # RSS doesn't have scores
                'replies': 0,  # RSS doesn't track comments
                'url': link,
                'content': content[:1000],  # Limit content length
                'source': feed_url,
                'tags': tags,
                'meta': {
                    'feed_url': feed_url,
                    'guid': entry.get('guid', '')
                }
            }

    # ===== COMMENT FETCHERS =====
    class comment_fetchers():
        """Functions to fetch comments for posts from various platforms"""
@@ -621,3 +703,42 @@ class data_methods():
            # Fetch and convert
            raw = data_methods.utils.http_get_json(url, params=params)
            return [data_methods.converters.stackexchange_to_schema(q, community) for q in raw.get('items', [])]

        @staticmethod
        def getRSSData(start_date, end_date, feed_url, max_posts):
            """
            Fetch and parse RSS/Atom feeds.
            Requires feedparser library: pip install feedparser
            """
            try:
                import feedparser
            except ImportError:
                print("Error: feedparser not installed. Run: pip install feedparser")
                return []

            try:
                # Fetch and parse RSS feed
                feed = feedparser.parse(feed_url)

                # Check for errors
                if hasattr(feed, 'bozo') and feed.bozo:
                    print(f"Warning: RSS feed may have issues: {feed.get('bozo_exception', 'Unknown error')}")

                # Extract entries
                entries = feed.get('entries', [])[:max_posts]
                if not entries:
                    print(f"No entries found in RSS feed: {feed_url}")
                    return []

                # Convert entries to unified schema
                posts = [data_methods.converters.rss_to_schema(entry, feed_url) for entry in entries]

                # Filter by date range
                return data_methods.utils.filter_by_date_range(posts, start_date, end_date)

            except Exception as e:
                print(f"Error fetching RSS feed {feed_url}: {e}")
                import traceback
                traceback.print_exc()
                return []
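
Closing with a usage sketch: the date-string format expected by data_methods.utils.filter_by_date_range is not shown in this diff, so the "YYYY-MM-DD" strings below are an assumption, as is the dataGrab module name (inferred from the error message in getData).

# Usage sketch; date format and module name are assumptions.
from dataGrab import data_methods

posts = data_methods.fetchers.getRSSData(
    "2025-10-01",               # start_date (format assumed)
    "2025-10-11",               # end_date (format assumed)
    "https://lobste.rs/rss",    # any feed URL added as a collection source
    25,                         # max_posts
)
for post in posts:
    print(post['timestamp'], post['title'], post['url'])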