Add RSS feed support to BalanceBoard

- Implement RSS/Atom feed parser using the feedparser library
- Add RSS platform configuration with sample feeds (HN RSS, Lobsters RSS, Reddit RSS)
- Support both RSS 2.0 and Atom formats with automatic detection
- Extract and normalize: title, author, link, content, tags, timestamps
- HTML entity unescaping and tag stripping for clean content
- Fallback handling for missing fields
- Users can add any RSS feed URL as a collection source

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -80,6 +80,8 @@ class data_methods():
|
|||||||
return data_methods.fetchers.getLobstersData(start_date, end_date, community, max_posts)
|
return data_methods.fetchers.getLobstersData(start_date, end_date, community, max_posts)
|
||||||
elif platform == "stackexchange":
|
elif platform == "stackexchange":
|
||||||
return data_methods.fetchers.getStackExchangeData(start_date, end_date, community, max_posts)
|
return data_methods.fetchers.getStackExchangeData(start_date, end_date, community, max_posts)
|
||||||
|
elif platform == "rss":
|
||||||
|
return data_methods.fetchers.getRSSData(start_date, end_date, community, max_posts)
|
||||||
else:
|
else:
|
||||||
print("dataGrab.getData: platform not recognized")
|
print("dataGrab.getData: platform not recognized")
|
||||||
return None
|
return None
|
||||||
@@ -263,6 +265,11 @@ class data_methods():
|
|||||||
'order': 'desc'
|
'order': 'desc'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def build_rss_url(feed_url):
|
||||||
|
"""RSS feeds use the URL directly as provided in config"""
|
||||||
|
return feed_url
|
||||||
|
|
||||||
# ===== SCHEMA CONVERTERS =====
|
# ===== SCHEMA CONVERTERS =====
|
||||||
class converters():
|
class converters():
|
||||||
"""Functions to convert platform-specific data to unified schema"""
|
"""Functions to convert platform-specific data to unified schema"""
|
||||||
@@ -340,6 +347,81 @@ class data_methods():
|
|||||||
'meta': {'view_count': raw.get('view_count', 0)}
|
'meta': {'view_count': raw.get('view_count', 0)}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def rss_to_schema(entry, feed_url):
|
||||||
|
"""
|
||||||
|
Convert RSS/Atom feed entry to unified schema.
|
||||||
|
Supports both RSS 2.0 and Atom formats via feedparser.
|
||||||
|
"""
|
||||||
|
import hashlib
|
||||||
|
from html import unescape
|
||||||
|
|
||||||
|
# Extract link (RSS uses 'link', Atom may use 'links')
|
||||||
|
link = entry.get('link', '')
|
||||||
|
if not link and 'links' in entry and len(entry['links']) > 0:
|
||||||
|
link = entry['links'][0].get('href', '')
|
||||||
|
|
||||||
|
# Generate ID from link or guid
|
||||||
|
entry_id = entry.get('id', entry.get('guid', link))
|
||||||
|
if not entry_id:
|
||||||
|
# Fallback: hash of title + link
|
||||||
|
entry_id = hashlib.md5(f"{entry.get('title', '')}{link}".encode()).hexdigest()
|
||||||
|
|
||||||
|
# Clean up ID to make it URL-safe
|
||||||
|
safe_id = entry_id.replace('/', '_').replace(':', '_')[:100]
|
||||||
|
|
||||||
|
# Extract timestamp
|
||||||
|
timestamp = 0
|
||||||
|
if 'published_parsed' in entry and entry['published_parsed']:
|
||||||
|
import time
|
||||||
|
timestamp = int(time.mktime(entry['published_parsed']))
|
||||||
|
elif 'updated_parsed' in entry and entry['updated_parsed']:
|
||||||
|
import time
|
||||||
|
timestamp = int(time.mktime(entry['updated_parsed']))
|
||||||
|
|
||||||
|
# Extract author
|
||||||
|
author = 'unknown'
|
||||||
|
if 'author' in entry:
|
||||||
|
author = entry['author']
|
||||||
|
elif 'author_detail' in entry:
|
||||||
|
author = entry['author_detail'].get('name', 'unknown')
|
||||||
|
|
||||||
|
# Extract content (try summary, then description, then content)
|
||||||
|
content = ''
|
||||||
|
if 'summary' in entry:
|
||||||
|
content = unescape(entry['summary'])
|
||||||
|
elif 'description' in entry:
|
||||||
|
content = unescape(entry['description'])
|
||||||
|
elif 'content' in entry and len(entry['content']) > 0:
|
||||||
|
content = unescape(entry['content'][0].get('value', ''))
|
||||||
|
|
||||||
|
# Strip HTML tags for cleaner content
|
||||||
|
import re
|
||||||
|
content = re.sub(r'<[^>]+>', '', content)
|
||||||
|
|
||||||
|
# Extract tags/categories
|
||||||
|
tags = []
|
||||||
|
if 'tags' in entry:
|
||||||
|
tags = [tag.get('term', '') for tag in entry['tags']]
|
||||||
|
|
||||||
|
return {
|
||||||
|
'platform': 'rss',
|
||||||
|
'id': f"rss_{safe_id}",
|
||||||
|
'title': unescape(entry.get('title', 'Untitled')),
|
||||||
|
'author': author,
|
||||||
|
'timestamp': timestamp,
|
||||||
|
'score': 0, # RSS doesn't have scores
|
||||||
|
'replies': 0, # RSS doesn't track comments
|
||||||
|
'url': link,
|
||||||
|
'content': content[:1000], # Limit content length
|
||||||
|
'source': feed_url,
|
||||||
|
'tags': tags,
|
||||||
|
'meta': {
|
||||||
|
'feed_url': feed_url,
|
||||||
|
'guid': entry.get('guid', '')
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
# ===== COMMENT FETCHERS =====
|
# ===== COMMENT FETCHERS =====
|
||||||
class comment_fetchers():
|
class comment_fetchers():
|
||||||
"""Functions to fetch comments for posts from various platforms"""
|
"""Functions to fetch comments for posts from various platforms"""
|
||||||
@@ -621,3 +703,42 @@ class data_methods():
|
|||||||
# Fetch and convert
|
# Fetch and convert
|
||||||
raw = data_methods.utils.http_get_json(url, params=params)
|
raw = data_methods.utils.http_get_json(url, params=params)
|
||||||
return [data_methods.converters.stackexchange_to_schema(q, community) for q in raw.get('items', [])]
|
return [data_methods.converters.stackexchange_to_schema(q, community) for q in raw.get('items', [])]
|
||||||
|
|
||||||
|
@staticmethod
def getRSSData(start_date, end_date, feed_url, max_posts):
    """
    Fetch an RSS/Atom feed and return its entries in the unified schema.

    Requires the third-party feedparser library: pip install feedparser.
    Returns [] (after printing a message) when feedparser is missing,
    the feed has no entries, or any fetch/parse error occurs.

    Args:
        start_date, end_date: date window used to filter the results.
        feed_url: the feed URL (the 'community' value for the rss platform).
        max_posts: maximum number of entries taken from the feed.
    """
    try:
        import feedparser
    except ImportError:
        print("Error: feedparser not installed. Run: pip install feedparser")
        return []

    try:
        parsed = feedparser.parse(feed_url)

        # feedparser flags malformed feeds via 'bozo' but still parses
        # best-effort, so warn instead of aborting.
        if getattr(parsed, 'bozo', False):
            print(f"Warning: RSS feed may have issues: {parsed.get('bozo_exception', 'Unknown error')}")

        entries = parsed.get('entries', [])[:max_posts]
        if not entries:
            print(f"No entries found in RSS feed: {feed_url}")
            return []

        # Normalize every entry, then restrict to the requested window
        converted = [
            data_methods.converters.rss_to_schema(item, feed_url)
            for item in entries
        ]
        return data_methods.utils.filter_by_date_range(converted, start_date, end_date)
    except Exception as e:
        print(f"Error fetching RSS feed {feed_url}: {e}")
        import traceback
        traceback.print_exc()
        return []
|
||||||
|
|||||||
262
platform_config.json
Normal file
262
platform_config.json
Normal file
@@ -0,0 +1,262 @@
|
|||||||
|
{
|
||||||
|
"platforms": {
|
||||||
|
"reddit": {
|
||||||
|
"name": "Reddit",
|
||||||
|
"icon": "🔺",
|
||||||
|
"color": "#ff4500",
|
||||||
|
"prefix": "r/",
|
||||||
|
"supports_communities": true,
|
||||||
|
"communities": [
|
||||||
|
{
|
||||||
|
"id": "programming",
|
||||||
|
"name": "Programming",
|
||||||
|
"display_name": "r/programming",
|
||||||
|
"icon": "💻",
|
||||||
|
"description": "General programming discussions"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "python",
|
||||||
|
"name": "Python",
|
||||||
|
"display_name": "r/python",
|
||||||
|
"icon": "🐍",
|
||||||
|
"description": "Python programming language"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "javascript",
|
||||||
|
"name": "JavaScript",
|
||||||
|
"display_name": "r/javascript",
|
||||||
|
"icon": "🟨",
|
||||||
|
"description": "JavaScript programming"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "webdev",
|
||||||
|
"name": "Web Development",
|
||||||
|
"display_name": "r/webdev",
|
||||||
|
"icon": "🌐",
|
||||||
|
"description": "Web development discussions"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "technology",
|
||||||
|
"name": "Technology",
|
||||||
|
"display_name": "r/technology",
|
||||||
|
"icon": "⚡",
|
||||||
|
"description": "Technology news and discussions"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"hackernews": {
|
||||||
|
"name": "Hacker News",
|
||||||
|
"icon": "🧮",
|
||||||
|
"color": "#ff6600",
|
||||||
|
"prefix": "",
|
||||||
|
"supports_communities": false,
|
||||||
|
"communities": [
|
||||||
|
{
|
||||||
|
"id": "front_page",
|
||||||
|
"name": "Front Page",
|
||||||
|
"display_name": "Hacker News",
|
||||||
|
"icon": "🧮",
|
||||||
|
"description": "Top stories from Hacker News"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "new",
|
||||||
|
"name": "New Stories",
|
||||||
|
"display_name": "HN New",
|
||||||
|
"icon": "🆕",
|
||||||
|
"description": "Latest submissions"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "ask",
|
||||||
|
"name": "Ask HN",
|
||||||
|
"display_name": "Ask HN",
|
||||||
|
"icon": "❓",
|
||||||
|
"description": "Questions for the community"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "show",
|
||||||
|
"name": "Show HN",
|
||||||
|
"display_name": "Show HN",
|
||||||
|
"icon": "🎯",
|
||||||
|
"description": "User projects and demos"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"lobsters": {
|
||||||
|
"name": "Lobsters",
|
||||||
|
"icon": "🦞",
|
||||||
|
"color": "#800020",
|
||||||
|
"prefix": "",
|
||||||
|
"supports_communities": false,
|
||||||
|
"communities": [
|
||||||
|
{
|
||||||
|
"id": "all",
|
||||||
|
"name": "All Stories",
|
||||||
|
"display_name": "Lobsters",
|
||||||
|
"icon": "🦞",
|
||||||
|
"description": "All lobsters stories"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"github": {
|
||||||
|
"name": "GitHub",
|
||||||
|
"icon": "🐙",
|
||||||
|
"color": "#24292e",
|
||||||
|
"prefix": "",
|
||||||
|
"supports_communities": false,
|
||||||
|
"communities": [
|
||||||
|
{
|
||||||
|
"id": "trending",
|
||||||
|
"name": "Trending",
|
||||||
|
"display_name": "GitHub Trending",
|
||||||
|
"icon": "📈",
|
||||||
|
"description": "Trending repositories"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "releases",
|
||||||
|
"name": "Releases",
|
||||||
|
"display_name": "New Releases",
|
||||||
|
"icon": "🎉",
|
||||||
|
"description": "Latest software releases"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"devto": {
|
||||||
|
"name": "Dev.to",
|
||||||
|
"icon": "📝",
|
||||||
|
"color": "#0a0a0a",
|
||||||
|
"prefix": "",
|
||||||
|
"supports_communities": false,
|
||||||
|
"communities": [
|
||||||
|
{
|
||||||
|
"id": "top",
|
||||||
|
"name": "Top Posts",
|
||||||
|
"display_name": "Dev.to Top",
|
||||||
|
"icon": "⭐",
|
||||||
|
"description": "Most popular dev posts"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "latest",
|
||||||
|
"name": "Latest",
|
||||||
|
"display_name": "Dev.to Latest",
|
||||||
|
"icon": "🆕",
|
||||||
|
"description": "Recently published articles"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"stackoverflow": {
|
||||||
|
"name": "Stack Overflow",
|
||||||
|
"icon": "📚",
|
||||||
|
"color": "#f48024",
|
||||||
|
"prefix": "",
|
||||||
|
"supports_communities": false,
|
||||||
|
"communities": [
|
||||||
|
{
|
||||||
|
"id": "featured",
|
||||||
|
"name": "Featured",
|
||||||
|
"display_name": "SO Featured",
|
||||||
|
"icon": "⭐",
|
||||||
|
"description": "Featured questions"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "newest",
|
||||||
|
"name": "Newest",
|
||||||
|
"display_name": "SO Newest",
|
||||||
|
"icon": "🆕",
|
||||||
|
"description": "Recent questions"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"rss": {
|
||||||
|
"name": "RSS Feeds",
|
||||||
|
"icon": "📡",
|
||||||
|
"color": "#ee802f",
|
||||||
|
"prefix": "",
|
||||||
|
"supports_communities": true,
|
||||||
|
"communities": [
|
||||||
|
{
|
||||||
|
"id": "https://hnrss.org/frontpage",
|
||||||
|
"name": "HN RSS",
|
||||||
|
"display_name": "HN RSS Feed",
|
||||||
|
"icon": "🧮",
|
||||||
|
"description": "Hacker News front page via RSS"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "https://lobste.rs/rss",
|
||||||
|
"name": "Lobsters RSS",
|
||||||
|
"display_name": "Lobsters RSS Feed",
|
||||||
|
"icon": "🦞",
|
||||||
|
"description": "Lobsters community via RSS"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "https://www.reddit.com/r/programming/.rss",
|
||||||
|
"name": "r/programming RSS",
|
||||||
|
"display_name": "r/programming RSS",
|
||||||
|
"icon": "💻",
|
||||||
|
"description": "Reddit programming subreddit via RSS"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"collection_targets": [
|
||||||
|
{
|
||||||
|
"platform": "reddit",
|
||||||
|
"community": "programming",
|
||||||
|
"max_posts": 75,
|
||||||
|
"priority": "high"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"platform": "reddit",
|
||||||
|
"community": "python",
|
||||||
|
"max_posts": 75,
|
||||||
|
"priority": "high"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"platform": "reddit",
|
||||||
|
"community": "javascript",
|
||||||
|
"max_posts": 50,
|
||||||
|
"priority": "medium"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"platform": "reddit",
|
||||||
|
"community": "webdev",
|
||||||
|
"max_posts": 50,
|
||||||
|
"priority": "medium"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"platform": "reddit",
|
||||||
|
"community": "technology",
|
||||||
|
"max_posts": 50,
|
||||||
|
"priority": "medium"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"platform": "hackernews",
|
||||||
|
"community": "front_page",
|
||||||
|
"max_posts": 100,
|
||||||
|
"priority": "high"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"platform": "hackernews",
|
||||||
|
"community": "ask",
|
||||||
|
"max_posts": 25,
|
||||||
|
"priority": "medium"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"platform": "hackernews",
|
||||||
|
"community": "show",
|
||||||
|
"max_posts": 25,
|
||||||
|
"priority": "medium"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"platform": "lobsters",
|
||||||
|
"community": "all",
|
||||||
|
"max_posts": 30,
|
||||||
|
"priority": "medium"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"platform": "rss",
|
||||||
|
"community": "https://hnrss.org/frontpage",
|
||||||
|
"max_posts": 50,
|
||||||
|
"priority": "low"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -11,3 +11,4 @@ sqlalchemy==2.0.36
|
|||||||
authlib==1.3.2
|
authlib==1.3.2
|
||||||
APScheduler==3.10.4
|
APScheduler==3.10.4
|
||||||
praw==7.7.1
|
praw==7.7.1
|
||||||
|
feedparser==6.0.12
|
||||||
|
|||||||
Reference in New Issue
Block a user