Add RSS feed support to BalanceBoard

- Implement RSS/Atom feed parser using feedparser library
- Add RSS platform configuration with sample feeds (HN RSS, Lobsters RSS, Reddit RSS)
- Support both RSS 2.0 and Atom formats with automatic detection
- Extract and normalize: title, author, link, content, tags, timestamps
- HTML entity unescaping and tag stripping for clean content
- Fallback handling for missing fields
- Users can add any RSS feed URL as a collection source

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-10-11 17:20:52 -05:00
parent e821a26b48
commit 47cca9d45e
3 changed files with 384 additions and 0 deletions

View File

@@ -80,6 +80,8 @@ class data_methods():
return data_methods.fetchers.getLobstersData(start_date, end_date, community, max_posts)
elif platform == "stackexchange":
return data_methods.fetchers.getStackExchangeData(start_date, end_date, community, max_posts)
elif platform == "rss":
return data_methods.fetchers.getRSSData(start_date, end_date, community, max_posts)
else:
print("dataGrab.getData: platform not recognized")
return None
@@ -263,6 +265,11 @@ class data_methods():
'order': 'desc'
}
@staticmethod
def build_rss_url(feed_url):
"""RSS feeds use the URL directly as provided in config"""
return feed_url
# ===== SCHEMA CONVERTERS =====
class converters():
"""Functions to convert platform-specific data to unified schema"""
@@ -340,6 +347,81 @@ class data_methods():
'meta': {'view_count': raw.get('view_count', 0)}
}
@staticmethod
def rss_to_schema(entry, feed_url):
"""
Convert RSS/Atom feed entry to unified schema.
Supports both RSS 2.0 and Atom formats via feedparser.
"""
import hashlib
from html import unescape
# Extract link (RSS uses 'link', Atom may use 'links')
link = entry.get('link', '')
if not link and 'links' in entry and len(entry['links']) > 0:
link = entry['links'][0].get('href', '')
# Generate ID from link or guid
entry_id = entry.get('id', entry.get('guid', link))
if not entry_id:
# Fallback: hash of title + link
entry_id = hashlib.md5(f"{entry.get('title', '')}{link}".encode()).hexdigest()
# Clean up ID to make it URL-safe
safe_id = entry_id.replace('/', '_').replace(':', '_')[:100]
# Extract timestamp
timestamp = 0
if 'published_parsed' in entry and entry['published_parsed']:
import time
timestamp = int(time.mktime(entry['published_parsed']))
elif 'updated_parsed' in entry and entry['updated_parsed']:
import time
timestamp = int(time.mktime(entry['updated_parsed']))
# Extract author
author = 'unknown'
if 'author' in entry:
author = entry['author']
elif 'author_detail' in entry:
author = entry['author_detail'].get('name', 'unknown')
# Extract content (try summary, then description, then content)
content = ''
if 'summary' in entry:
content = unescape(entry['summary'])
elif 'description' in entry:
content = unescape(entry['description'])
elif 'content' in entry and len(entry['content']) > 0:
content = unescape(entry['content'][0].get('value', ''))
# Strip HTML tags for cleaner content
import re
content = re.sub(r'<[^>]+>', '', content)
# Extract tags/categories
tags = []
if 'tags' in entry:
tags = [tag.get('term', '') for tag in entry['tags']]
return {
'platform': 'rss',
'id': f"rss_{safe_id}",
'title': unescape(entry.get('title', 'Untitled')),
'author': author,
'timestamp': timestamp,
'score': 0, # RSS doesn't have scores
'replies': 0, # RSS doesn't track comments
'url': link,
'content': content[:1000], # Limit content length
'source': feed_url,
'tags': tags,
'meta': {
'feed_url': feed_url,
'guid': entry.get('guid', '')
}
}
# ===== COMMENT FETCHERS =====
class comment_fetchers():
"""Functions to fetch comments for posts from various platforms"""
@@ -621,3 +703,42 @@ class data_methods():
# Fetch and convert
raw = data_methods.utils.http_get_json(url, params=params)
return [data_methods.converters.stackexchange_to_schema(q, community) for q in raw.get('items', [])]
@staticmethod
def getRSSData(start_date, end_date, feed_url, max_posts):
    """
    Fetch an RSS/Atom feed and return posts in the unified schema.

    Requires the feedparser library (pip install feedparser). Returns
    an empty list when the library is missing, the feed has no
    entries, or the fetch/parse fails.
    """
    try:
        import feedparser
    except ImportError:
        print("Error: feedparser not installed. Run: pip install feedparser")
        return []
    try:
        parsed = feedparser.parse(feed_url)
        # feedparser sets 'bozo' when the feed was malformed but still parseable
        if getattr(parsed, 'bozo', False):
            print(f"Warning: RSS feed may have issues: {parsed.get('bozo_exception', 'Unknown error')}")
        entries = parsed.get('entries', [])[:max_posts]
        if not entries:
            print(f"No entries found in RSS feed: {feed_url}")
            return []
        # Normalize every entry, then restrict to the requested window
        posts = [
            data_methods.converters.rss_to_schema(e, feed_url)
            for e in entries
        ]
        return data_methods.utils.filter_by_date_range(posts, start_date, end_date)
    except Exception as e:
        print(f"Error fetching RSS feed {feed_url}: {e}")
        import traceback
        traceback.print_exc()
        return []

262
platform_config.json Normal file
View File

@@ -0,0 +1,262 @@
{
"platforms": {
"reddit": {
"name": "Reddit",
"icon": "🔺",
"color": "#ff4500",
"prefix": "r/",
"supports_communities": true,
"communities": [
{
"id": "programming",
"name": "Programming",
"display_name": "r/programming",
"icon": "💻",
"description": "General programming discussions"
},
{
"id": "python",
"name": "Python",
"display_name": "r/python",
"icon": "🐍",
"description": "Python programming language"
},
{
"id": "javascript",
"name": "JavaScript",
"display_name": "r/javascript",
"icon": "🟨",
"description": "JavaScript programming"
},
{
"id": "webdev",
"name": "Web Development",
"display_name": "r/webdev",
"icon": "🌐",
"description": "Web development discussions"
},
{
"id": "technology",
"name": "Technology",
"display_name": "r/technology",
"icon": "⚡",
"description": "Technology news and discussions"
}
]
},
"hackernews": {
"name": "Hacker News",
"icon": "🧮",
"color": "#ff6600",
"prefix": "",
"supports_communities": false,
"communities": [
{
"id": "front_page",
"name": "Front Page",
"display_name": "Hacker News",
"icon": "🧮",
"description": "Top stories from Hacker News"
},
{
"id": "new",
"name": "New Stories",
"display_name": "HN New",
"icon": "🆕",
"description": "Latest submissions"
},
{
"id": "ask",
"name": "Ask HN",
"display_name": "Ask HN",
"icon": "❓",
"description": "Questions for the community"
},
{
"id": "show",
"name": "Show HN",
"display_name": "Show HN",
"icon": "🎯",
"description": "User projects and demos"
}
]
},
"lobsters": {
"name": "Lobsters",
"icon": "🦞",
"color": "#800020",
"prefix": "",
"supports_communities": false,
"communities": [
{
"id": "all",
"name": "All Stories",
"display_name": "Lobsters",
"icon": "🦞",
"description": "All lobsters stories"
}
]
},
"github": {
"name": "GitHub",
"icon": "🐙",
"color": "#24292e",
"prefix": "",
"supports_communities": false,
"communities": [
{
"id": "trending",
"name": "Trending",
"display_name": "GitHub Trending",
"icon": "📈",
"description": "Trending repositories"
},
{
"id": "releases",
"name": "Releases",
"display_name": "New Releases",
"icon": "🎉",
"description": "Latest software releases"
}
]
},
"devto": {
"name": "Dev.to",
"icon": "📝",
"color": "#0a0a0a",
"prefix": "",
"supports_communities": false,
"communities": [
{
"id": "top",
"name": "Top Posts",
"display_name": "Dev.to Top",
"icon": "⭐",
"description": "Most popular dev posts"
},
{
"id": "latest",
"name": "Latest",
"display_name": "Dev.to Latest",
"icon": "🆕",
"description": "Recently published articles"
}
]
},
"stackoverflow": {
"name": "Stack Overflow",
"icon": "📚",
"color": "#f48024",
"prefix": "",
"supports_communities": false,
"communities": [
{
"id": "featured",
"name": "Featured",
"display_name": "SO Featured",
"icon": "⭐",
"description": "Featured questions"
},
{
"id": "newest",
"name": "Newest",
"display_name": "SO Newest",
"icon": "🆕",
"description": "Recent questions"
}
]
},
"rss": {
"name": "RSS Feeds",
"icon": "📡",
"color": "#ee802f",
"prefix": "",
"supports_communities": true,
"communities": [
{
"id": "https://hnrss.org/frontpage",
"name": "HN RSS",
"display_name": "HN RSS Feed",
"icon": "🧮",
"description": "Hacker News front page via RSS"
},
{
"id": "https://lobste.rs/rss",
"name": "Lobsters RSS",
"display_name": "Lobsters RSS Feed",
"icon": "🦞",
"description": "Lobsters community via RSS"
},
{
"id": "https://www.reddit.com/r/programming/.rss",
"name": "r/programming RSS",
"display_name": "r/programming RSS",
"icon": "💻",
"description": "Reddit programming subreddit via RSS"
}
]
}
},
"collection_targets": [
{
"platform": "reddit",
"community": "programming",
"max_posts": 75,
"priority": "high"
},
{
"platform": "reddit",
"community": "python",
"max_posts": 75,
"priority": "high"
},
{
"platform": "reddit",
"community": "javascript",
"max_posts": 50,
"priority": "medium"
},
{
"platform": "reddit",
"community": "webdev",
"max_posts": 50,
"priority": "medium"
},
{
"platform": "reddit",
"community": "technology",
"max_posts": 50,
"priority": "medium"
},
{
"platform": "hackernews",
"community": "front_page",
"max_posts": 100,
"priority": "high"
},
{
"platform": "hackernews",
"community": "ask",
"max_posts": 25,
"priority": "medium"
},
{
"platform": "hackernews",
"community": "show",
"max_posts": 25,
"priority": "medium"
},
{
"platform": "lobsters",
"community": "all",
"max_posts": 30,
"priority": "medium"
},
{
"platform": "rss",
"community": "https://hnrss.org/frontpage",
"max_posts": 50,
"priority": "low"
}
]
}

View File

@@ -11,3 +11,4 @@ sqlalchemy==2.0.36
authlib==1.3.2
APScheduler==3.10.4
praw==7.7.1
feedparser==6.0.12