Add RSS feed support to BalanceBoard
- Implement RSS/Atom feed parser using feedparser library
- Add RSS platform configuration with sample feeds (HN RSS, Lobsters RSS, Reddit RSS)
- Support both RSS 2.0 and Atom formats with automatic detection
- Extract and normalize: title, author, link, content, tags, timestamps
- HTML entity unescaping and tag stripping for clean content
- Fallback handling for missing fields
- Users can add any RSS feed URL as a collection source

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -80,6 +80,8 @@ class data_methods():
|
||||
return data_methods.fetchers.getLobstersData(start_date, end_date, community, max_posts)
|
||||
elif platform == "stackexchange":
|
||||
return data_methods.fetchers.getStackExchangeData(start_date, end_date, community, max_posts)
|
||||
elif platform == "rss":
|
||||
return data_methods.fetchers.getRSSData(start_date, end_date, community, max_posts)
|
||||
else:
|
||||
print("dataGrab.getData: platform not recognized")
|
||||
return None
|
||||
@@ -263,6 +265,11 @@ class data_methods():
|
||||
'order': 'desc'
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def build_rss_url(feed_url):
|
||||
"""RSS feeds use the URL directly as provided in config"""
|
||||
return feed_url
|
||||
|
||||
# ===== SCHEMA CONVERTERS =====
|
||||
class converters():
|
||||
"""Functions to convert platform-specific data to unified schema"""
|
||||
@@ -340,6 +347,81 @@ class data_methods():
|
||||
'meta': {'view_count': raw.get('view_count', 0)}
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def rss_to_schema(entry, feed_url):
|
||||
"""
|
||||
Convert RSS/Atom feed entry to unified schema.
|
||||
Supports both RSS 2.0 and Atom formats via feedparser.
|
||||
"""
|
||||
import hashlib
|
||||
from html import unescape
|
||||
|
||||
# Extract link (RSS uses 'link', Atom may use 'links')
|
||||
link = entry.get('link', '')
|
||||
if not link and 'links' in entry and len(entry['links']) > 0:
|
||||
link = entry['links'][0].get('href', '')
|
||||
|
||||
# Generate ID from link or guid
|
||||
entry_id = entry.get('id', entry.get('guid', link))
|
||||
if not entry_id:
|
||||
# Fallback: hash of title + link
|
||||
entry_id = hashlib.md5(f"{entry.get('title', '')}{link}".encode()).hexdigest()
|
||||
|
||||
# Clean up ID to make it URL-safe
|
||||
safe_id = entry_id.replace('/', '_').replace(':', '_')[:100]
|
||||
|
||||
# Extract timestamp
|
||||
timestamp = 0
|
||||
if 'published_parsed' in entry and entry['published_parsed']:
|
||||
import time
|
||||
timestamp = int(time.mktime(entry['published_parsed']))
|
||||
elif 'updated_parsed' in entry and entry['updated_parsed']:
|
||||
import time
|
||||
timestamp = int(time.mktime(entry['updated_parsed']))
|
||||
|
||||
# Extract author
|
||||
author = 'unknown'
|
||||
if 'author' in entry:
|
||||
author = entry['author']
|
||||
elif 'author_detail' in entry:
|
||||
author = entry['author_detail'].get('name', 'unknown')
|
||||
|
||||
# Extract content (try summary, then description, then content)
|
||||
content = ''
|
||||
if 'summary' in entry:
|
||||
content = unescape(entry['summary'])
|
||||
elif 'description' in entry:
|
||||
content = unescape(entry['description'])
|
||||
elif 'content' in entry and len(entry['content']) > 0:
|
||||
content = unescape(entry['content'][0].get('value', ''))
|
||||
|
||||
# Strip HTML tags for cleaner content
|
||||
import re
|
||||
content = re.sub(r'<[^>]+>', '', content)
|
||||
|
||||
# Extract tags/categories
|
||||
tags = []
|
||||
if 'tags' in entry:
|
||||
tags = [tag.get('term', '') for tag in entry['tags']]
|
||||
|
||||
return {
|
||||
'platform': 'rss',
|
||||
'id': f"rss_{safe_id}",
|
||||
'title': unescape(entry.get('title', 'Untitled')),
|
||||
'author': author,
|
||||
'timestamp': timestamp,
|
||||
'score': 0, # RSS doesn't have scores
|
||||
'replies': 0, # RSS doesn't track comments
|
||||
'url': link,
|
||||
'content': content[:1000], # Limit content length
|
||||
'source': feed_url,
|
||||
'tags': tags,
|
||||
'meta': {
|
||||
'feed_url': feed_url,
|
||||
'guid': entry.get('guid', '')
|
||||
}
|
||||
}
|
||||
|
||||
# ===== COMMENT FETCHERS =====
|
||||
class comment_fetchers():
|
||||
"""Functions to fetch comments for posts from various platforms"""
|
||||
@@ -621,3 +703,42 @@ class data_methods():
|
||||
# Fetch and convert
|
||||
raw = data_methods.utils.http_get_json(url, params=params)
|
||||
return [data_methods.converters.stackexchange_to_schema(q, community) for q in raw.get('items', [])]
|
||||
|
||||
@staticmethod
def getRSSData(start_date, end_date, feed_url, max_posts):
    """
    Fetch an RSS/Atom feed and return its entries in the unified schema,
    restricted to the [start_date, end_date] window.

    Requires feedparser library: pip install feedparser
    Returns [] on any failure (missing library, fetch error, empty feed).
    """
    try:
        import feedparser
    except ImportError:
        print("Error: feedparser not installed. Run: pip install feedparser")
        return []

    try:
        # Fetch and parse the feed in one step
        parsed = feedparser.parse(feed_url)

        # feedparser flags malformed-but-parseable feeds via 'bozo';
        # warn but continue, since entries may still be usable.
        if getattr(parsed, 'bozo', False):
            print(f"Warning: RSS feed may have issues: {parsed.get('bozo_exception', 'Unknown error')}")

        # Cap at max_posts entries
        items = parsed.get('entries', [])[:max_posts]
        if not items:
            print(f"No entries found in RSS feed: {feed_url}")
            return []

        # Normalize to the unified schema, then apply the date window
        converted = [
            data_methods.converters.rss_to_schema(item, feed_url)
            for item in items
        ]
        return data_methods.utils.filter_by_date_range(converted, start_date, end_date)

    except Exception as exc:
        print(f"Error fetching RSS feed {feed_url}: {exc}")
        import traceback
        traceback.print_exc()
        return []
|
||||
|
||||
262
platform_config.json
Normal file
262
platform_config.json
Normal file
@@ -0,0 +1,262 @@
|
||||
{
|
||||
"platforms": {
|
||||
"reddit": {
|
||||
"name": "Reddit",
|
||||
"icon": "🔺",
|
||||
"color": "#ff4500",
|
||||
"prefix": "r/",
|
||||
"supports_communities": true,
|
||||
"communities": [
|
||||
{
|
||||
"id": "programming",
|
||||
"name": "Programming",
|
||||
"display_name": "r/programming",
|
||||
"icon": "💻",
|
||||
"description": "General programming discussions"
|
||||
},
|
||||
{
|
||||
"id": "python",
|
||||
"name": "Python",
|
||||
"display_name": "r/python",
|
||||
"icon": "🐍",
|
||||
"description": "Python programming language"
|
||||
},
|
||||
{
|
||||
"id": "javascript",
|
||||
"name": "JavaScript",
|
||||
"display_name": "r/javascript",
|
||||
"icon": "🟨",
|
||||
"description": "JavaScript programming"
|
||||
},
|
||||
{
|
||||
"id": "webdev",
|
||||
"name": "Web Development",
|
||||
"display_name": "r/webdev",
|
||||
"icon": "🌐",
|
||||
"description": "Web development discussions"
|
||||
},
|
||||
{
|
||||
"id": "technology",
|
||||
"name": "Technology",
|
||||
"display_name": "r/technology",
|
||||
"icon": "⚡",
|
||||
"description": "Technology news and discussions"
|
||||
}
|
||||
]
|
||||
},
|
||||
"hackernews": {
|
||||
"name": "Hacker News",
|
||||
"icon": "🧮",
|
||||
"color": "#ff6600",
|
||||
"prefix": "",
|
||||
"supports_communities": false,
|
||||
"communities": [
|
||||
{
|
||||
"id": "front_page",
|
||||
"name": "Front Page",
|
||||
"display_name": "Hacker News",
|
||||
"icon": "🧮",
|
||||
"description": "Top stories from Hacker News"
|
||||
},
|
||||
{
|
||||
"id": "new",
|
||||
"name": "New Stories",
|
||||
"display_name": "HN New",
|
||||
"icon": "🆕",
|
||||
"description": "Latest submissions"
|
||||
},
|
||||
{
|
||||
"id": "ask",
|
||||
"name": "Ask HN",
|
||||
"display_name": "Ask HN",
|
||||
"icon": "❓",
|
||||
"description": "Questions for the community"
|
||||
},
|
||||
{
|
||||
"id": "show",
|
||||
"name": "Show HN",
|
||||
"display_name": "Show HN",
|
||||
"icon": "🎯",
|
||||
"description": "User projects and demos"
|
||||
}
|
||||
]
|
||||
},
|
||||
"lobsters": {
|
||||
"name": "Lobsters",
|
||||
"icon": "🦞",
|
||||
"color": "#800020",
|
||||
"prefix": "",
|
||||
"supports_communities": false,
|
||||
"communities": [
|
||||
{
|
||||
"id": "all",
|
||||
"name": "All Stories",
|
||||
"display_name": "Lobsters",
|
||||
"icon": "🦞",
|
||||
"description": "All lobsters stories"
|
||||
}
|
||||
]
|
||||
},
|
||||
"github": {
|
||||
"name": "GitHub",
|
||||
"icon": "🐙",
|
||||
"color": "#24292e",
|
||||
"prefix": "",
|
||||
"supports_communities": false,
|
||||
"communities": [
|
||||
{
|
||||
"id": "trending",
|
||||
"name": "Trending",
|
||||
"display_name": "GitHub Trending",
|
||||
"icon": "📈",
|
||||
"description": "Trending repositories"
|
||||
},
|
||||
{
|
||||
"id": "releases",
|
||||
"name": "Releases",
|
||||
"display_name": "New Releases",
|
||||
"icon": "🎉",
|
||||
"description": "Latest software releases"
|
||||
}
|
||||
]
|
||||
},
|
||||
"devto": {
|
||||
"name": "Dev.to",
|
||||
"icon": "📝",
|
||||
"color": "#0a0a0a",
|
||||
"prefix": "",
|
||||
"supports_communities": false,
|
||||
"communities": [
|
||||
{
|
||||
"id": "top",
|
||||
"name": "Top Posts",
|
||||
"display_name": "Dev.to Top",
|
||||
"icon": "⭐",
|
||||
"description": "Most popular dev posts"
|
||||
},
|
||||
{
|
||||
"id": "latest",
|
||||
"name": "Latest",
|
||||
"display_name": "Dev.to Latest",
|
||||
"icon": "🆕",
|
||||
"description": "Recently published articles"
|
||||
}
|
||||
]
|
||||
},
|
||||
"stackoverflow": {
|
||||
"name": "Stack Overflow",
|
||||
"icon": "📚",
|
||||
"color": "#f48024",
|
||||
"prefix": "",
|
||||
"supports_communities": false,
|
||||
"communities": [
|
||||
{
|
||||
"id": "featured",
|
||||
"name": "Featured",
|
||||
"display_name": "SO Featured",
|
||||
"icon": "⭐",
|
||||
"description": "Featured questions"
|
||||
},
|
||||
{
|
||||
"id": "newest",
|
||||
"name": "Newest",
|
||||
"display_name": "SO Newest",
|
||||
"icon": "🆕",
|
||||
"description": "Recent questions"
|
||||
}
|
||||
]
|
||||
},
|
||||
"rss": {
|
||||
"name": "RSS Feeds",
|
||||
"icon": "📡",
|
||||
"color": "#ee802f",
|
||||
"prefix": "",
|
||||
"supports_communities": true,
|
||||
"communities": [
|
||||
{
|
||||
"id": "https://hnrss.org/frontpage",
|
||||
"name": "HN RSS",
|
||||
"display_name": "HN RSS Feed",
|
||||
"icon": "🧮",
|
||||
"description": "Hacker News front page via RSS"
|
||||
},
|
||||
{
|
||||
"id": "https://lobste.rs/rss",
|
||||
"name": "Lobsters RSS",
|
||||
"display_name": "Lobsters RSS Feed",
|
||||
"icon": "🦞",
|
||||
"description": "Lobsters community via RSS"
|
||||
},
|
||||
{
|
||||
"id": "https://www.reddit.com/r/programming/.rss",
|
||||
"name": "r/programming RSS",
|
||||
"display_name": "r/programming RSS",
|
||||
"icon": "💻",
|
||||
"description": "Reddit programming subreddit via RSS"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"collection_targets": [
|
||||
{
|
||||
"platform": "reddit",
|
||||
"community": "programming",
|
||||
"max_posts": 75,
|
||||
"priority": "high"
|
||||
},
|
||||
{
|
||||
"platform": "reddit",
|
||||
"community": "python",
|
||||
"max_posts": 75,
|
||||
"priority": "high"
|
||||
},
|
||||
{
|
||||
"platform": "reddit",
|
||||
"community": "javascript",
|
||||
"max_posts": 50,
|
||||
"priority": "medium"
|
||||
},
|
||||
{
|
||||
"platform": "reddit",
|
||||
"community": "webdev",
|
||||
"max_posts": 50,
|
||||
"priority": "medium"
|
||||
},
|
||||
{
|
||||
"platform": "reddit",
|
||||
"community": "technology",
|
||||
"max_posts": 50,
|
||||
"priority": "medium"
|
||||
},
|
||||
{
|
||||
"platform": "hackernews",
|
||||
"community": "front_page",
|
||||
"max_posts": 100,
|
||||
"priority": "high"
|
||||
},
|
||||
{
|
||||
"platform": "hackernews",
|
||||
"community": "ask",
|
||||
"max_posts": 25,
|
||||
"priority": "medium"
|
||||
},
|
||||
{
|
||||
"platform": "hackernews",
|
||||
"community": "show",
|
||||
"max_posts": 25,
|
||||
"priority": "medium"
|
||||
},
|
||||
{
|
||||
"platform": "lobsters",
|
||||
"community": "all",
|
||||
"max_posts": 30,
|
||||
"priority": "medium"
|
||||
},
|
||||
{
|
||||
"platform": "rss",
|
||||
"community": "https://hnrss.org/frontpage",
|
||||
"max_posts": 50,
|
||||
"priority": "low"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -11,3 +11,4 @@ sqlalchemy==2.0.36
|
||||
authlib==1.3.2
|
||||
APScheduler==3.10.4
|
||||
praw==7.7.1
|
||||
feedparser==6.0.12
|
||||
|
||||
Reference in New Issue
Block a user