Add RSS feed support to BalanceBoard

- Implement RSS/Atom feed parser using feedparser library
- Add RSS platform configuration with sample feeds (HN RSS, Lobsters RSS, Reddit RSS)
- Support both RSS 2.0 and Atom formats with automatic detection
- Extract and normalize: title, author, link, content, tags, timestamps
- HTML entity unescaping and tag stripping for clean content
- Fallback handling for missing fields
- Users can add any RSS feed URL as a collection source

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-10-11 17:20:52 -05:00
parent e821a26b48
commit 47cca9d45e
3 changed files with 384 additions and 0 deletions

View File

@@ -80,6 +80,8 @@ class data_methods():
return data_methods.fetchers.getLobstersData(start_date, end_date, community, max_posts)
elif platform == "stackexchange":
return data_methods.fetchers.getStackExchangeData(start_date, end_date, community, max_posts)
elif platform == "rss":
return data_methods.fetchers.getRSSData(start_date, end_date, community, max_posts)
else:
print("dataGrab.getData: platform not recognized")
return None
@@ -263,6 +265,11 @@ class data_methods():
'order': 'desc'
}
@staticmethod
def build_rss_url(feed_url):
"""RSS feeds use the URL directly as provided in config"""
return feed_url
# ===== SCHEMA CONVERTERS =====
class converters():
"""Functions to convert platform-specific data to unified schema"""
@@ -340,6 +347,81 @@ class data_methods():
'meta': {'view_count': raw.get('view_count', 0)}
}
@staticmethod
def rss_to_schema(entry, feed_url):
"""
Convert RSS/Atom feed entry to unified schema.
Supports both RSS 2.0 and Atom formats via feedparser.
"""
import hashlib
from html import unescape
# Extract link (RSS uses 'link', Atom may use 'links')
link = entry.get('link', '')
if not link and 'links' in entry and len(entry['links']) > 0:
link = entry['links'][0].get('href', '')
# Generate ID from link or guid
entry_id = entry.get('id', entry.get('guid', link))
if not entry_id:
# Fallback: hash of title + link
entry_id = hashlib.md5(f"{entry.get('title', '')}{link}".encode()).hexdigest()
# Clean up ID to make it URL-safe
safe_id = entry_id.replace('/', '_').replace(':', '_')[:100]
# Extract timestamp
timestamp = 0
if 'published_parsed' in entry and entry['published_parsed']:
import time
timestamp = int(time.mktime(entry['published_parsed']))
elif 'updated_parsed' in entry and entry['updated_parsed']:
import time
timestamp = int(time.mktime(entry['updated_parsed']))
# Extract author
author = 'unknown'
if 'author' in entry:
author = entry['author']
elif 'author_detail' in entry:
author = entry['author_detail'].get('name', 'unknown')
# Extract content (try summary, then description, then content)
content = ''
if 'summary' in entry:
content = unescape(entry['summary'])
elif 'description' in entry:
content = unescape(entry['description'])
elif 'content' in entry and len(entry['content']) > 0:
content = unescape(entry['content'][0].get('value', ''))
# Strip HTML tags for cleaner content
import re
content = re.sub(r'<[^>]+>', '', content)
# Extract tags/categories
tags = []
if 'tags' in entry:
tags = [tag.get('term', '') for tag in entry['tags']]
return {
'platform': 'rss',
'id': f"rss_{safe_id}",
'title': unescape(entry.get('title', 'Untitled')),
'author': author,
'timestamp': timestamp,
'score': 0, # RSS doesn't have scores
'replies': 0, # RSS doesn't track comments
'url': link,
'content': content[:1000], # Limit content length
'source': feed_url,
'tags': tags,
'meta': {
'feed_url': feed_url,
'guid': entry.get('guid', '')
}
}
# ===== COMMENT FETCHERS =====
class comment_fetchers():
"""Functions to fetch comments for posts from various platforms"""
@@ -621,3 +703,42 @@ class data_methods():
# Fetch and convert
raw = data_methods.utils.http_get_json(url, params=params)
return [data_methods.converters.stackexchange_to_schema(q, community) for q in raw.get('items', [])]
@staticmethod
def getRSSData(start_date, end_date, feed_url, max_posts):
    """
    Fetch an RSS/Atom feed and return posts in the unified schema.

    Requires the feedparser library (pip install feedparser). Returns
    an empty list when the library is missing, the feed has no
    entries, or the fetch/parse fails.
    """
    try:
        import feedparser
    except ImportError:
        print("Error: feedparser not installed. Run: pip install feedparser")
        return []
    try:
        parsed = feedparser.parse(feed_url)
        # feedparser sets 'bozo' when the feed was malformed but still parseable
        if getattr(parsed, 'bozo', False):
            print(f"Warning: RSS feed may have issues: {parsed.get('bozo_exception', 'Unknown error')}")
        entries = parsed.get('entries', [])[:max_posts]
        if not entries:
            print(f"No entries found in RSS feed: {feed_url}")
            return []
        # Normalize every entry, then restrict to the requested window
        posts = [
            data_methods.converters.rss_to_schema(e, feed_url)
            for e in entries
        ]
        return data_methods.utils.filter_by_date_range(posts, start_date, end_date)
    except Exception as e:
        print(f"Error fetching RSS feed {feed_url}: {e}")
        import traceback
        traceback.print_exc()
        return []

262
platform_config.json Normal file
View File

@@ -0,0 +1,262 @@
{
"platforms": {
"reddit": {
"name": "Reddit",
"icon": "🔺",
"color": "#ff4500",
"prefix": "r/",
"supports_communities": true,
"communities": [
{
"id": "programming",
"name": "Programming",
"display_name": "r/programming",
"icon": "💻",
"description": "General programming discussions"
},
{
"id": "python",
"name": "Python",
"display_name": "r/python",
"icon": "🐍",
"description": "Python programming language"
},
{
"id": "javascript",
"name": "JavaScript",
"display_name": "r/javascript",
"icon": "🟨",
"description": "JavaScript programming"
},
{
"id": "webdev",
"name": "Web Development",
"display_name": "r/webdev",
"icon": "🌐",
"description": "Web development discussions"
},
{
"id": "technology",
"name": "Technology",
"display_name": "r/technology",
"icon": "⚡",
"description": "Technology news and discussions"
}
]
},
"hackernews": {
"name": "Hacker News",
"icon": "🧮",
"color": "#ff6600",
"prefix": "",
"supports_communities": false,
"communities": [
{
"id": "front_page",
"name": "Front Page",
"display_name": "Hacker News",
"icon": "🧮",
"description": "Top stories from Hacker News"
},
{
"id": "new",
"name": "New Stories",
"display_name": "HN New",
"icon": "🆕",
"description": "Latest submissions"
},
{
"id": "ask",
"name": "Ask HN",
"display_name": "Ask HN",
"icon": "❓",
"description": "Questions for the community"
},
{
"id": "show",
"name": "Show HN",
"display_name": "Show HN",
"icon": "🎯",
"description": "User projects and demos"
}
]
},
"lobsters": {
"name": "Lobsters",
"icon": "🦞",
"color": "#800020",
"prefix": "",
"supports_communities": false,
"communities": [
{
"id": "all",
"name": "All Stories",
"display_name": "Lobsters",
"icon": "🦞",
"description": "All lobsters stories"
}
]
},
"github": {
"name": "GitHub",
"icon": "🐙",
"color": "#24292e",
"prefix": "",
"supports_communities": false,
"communities": [
{
"id": "trending",
"name": "Trending",
"display_name": "GitHub Trending",
"icon": "📈",
"description": "Trending repositories"
},
{
"id": "releases",
"name": "Releases",
"display_name": "New Releases",
"icon": "🎉",
"description": "Latest software releases"
}
]
},
"devto": {
"name": "Dev.to",
"icon": "📝",
"color": "#0a0a0a",
"prefix": "",
"supports_communities": false,
"communities": [
{
"id": "top",
"name": "Top Posts",
"display_name": "Dev.to Top",
"icon": "⭐",
"description": "Most popular dev posts"
},
{
"id": "latest",
"name": "Latest",
"display_name": "Dev.to Latest",
"icon": "🆕",
"description": "Recently published articles"
}
]
},
"stackoverflow": {
"name": "Stack Overflow",
"icon": "📚",
"color": "#f48024",
"prefix": "",
"supports_communities": false,
"communities": [
{
"id": "featured",
"name": "Featured",
"display_name": "SO Featured",
"icon": "⭐",
"description": "Featured questions"
},
{
"id": "newest",
"name": "Newest",
"display_name": "SO Newest",
"icon": "🆕",
"description": "Recent questions"
}
]
},
"rss": {
"name": "RSS Feeds",
"icon": "📡",
"color": "#ee802f",
"prefix": "",
"supports_communities": true,
"communities": [
{
"id": "https://hnrss.org/frontpage",
"name": "HN RSS",
"display_name": "HN RSS Feed",
"icon": "🧮",
"description": "Hacker News front page via RSS"
},
{
"id": "https://lobste.rs/rss",
"name": "Lobsters RSS",
"display_name": "Lobsters RSS Feed",
"icon": "🦞",
"description": "Lobsters community via RSS"
},
{
"id": "https://www.reddit.com/r/programming/.rss",
"name": "r/programming RSS",
"display_name": "r/programming RSS",
"icon": "💻",
"description": "Reddit programming subreddit via RSS"
}
]
}
},
"collection_targets": [
{
"platform": "reddit",
"community": "programming",
"max_posts": 75,
"priority": "high"
},
{
"platform": "reddit",
"community": "python",
"max_posts": 75,
"priority": "high"
},
{
"platform": "reddit",
"community": "javascript",
"max_posts": 50,
"priority": "medium"
},
{
"platform": "reddit",
"community": "webdev",
"max_posts": 50,
"priority": "medium"
},
{
"platform": "reddit",
"community": "technology",
"max_posts": 50,
"priority": "medium"
},
{
"platform": "hackernews",
"community": "front_page",
"max_posts": 100,
"priority": "high"
},
{
"platform": "hackernews",
"community": "ask",
"max_posts": 25,
"priority": "medium"
},
{
"platform": "hackernews",
"community": "show",
"max_posts": 25,
"priority": "medium"
},
{
"platform": "lobsters",
"community": "all",
"max_posts": 30,
"priority": "medium"
},
{
"platform": "rss",
"community": "https://hnrss.org/frontpage",
"max_posts": 50,
"priority": "low"
}
]
}

View File

@@ -11,3 +11,4 @@ sqlalchemy==2.0.36
authlib==1.3.2
APScheduler==3.10.4
praw==7.7.1
feedparser==6.0.12