Add RSS feed support to BalanceBoard

- Implement RSS/Atom feed parser using feedparser library
- Add RSS platform configuration with sample feeds (HN RSS, Lobsters RSS, Reddit RSS)
- Support both RSS 2.0 and Atom formats with automatic detection
- Extract and normalize: title, author, link, content, tags, timestamps
- HTML entity unescaping and tag stripping for clean content
- Fallback handling for missing fields
- Users can add any RSS feed URL as a collection source

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-11 17:20:52 -05:00
parent e821a26b48
commit 47cca9d45e
3 changed files with 384 additions and 0 deletions
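
The commit message above mentions an RSS platform configuration seeded with sample feeds (HN, Lobsters, Reddit), presumably one of the three changed files, but it is not shown in the diff below. The following is only a rough sketch under assumed names: the mapping name, file layout, and exact sample URLs are guesses, not the committed configuration.

# Hypothetical sketch of the RSS source configuration; the real file's
# name, structure, and exact feed list are not visible in this diff.
SAMPLE_RSS_FEEDS = {
    "hackernews": "https://news.ycombinator.com/rss",         # HN front-page RSS
    "lobsters": "https://lobste.rs/rss",                       # Lobsters newest stories
    "reddit": "https://www.reddit.com/r/programming/.rss",     # any subreddit's feed
}

Because build_rss_url (below) returns the configured URL unchanged, any feed URL a user adds as a collection source works the same way as these samples.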


@@ -80,6 +80,8 @@ class data_methods():
            return data_methods.fetchers.getLobstersData(start_date, end_date, community, max_posts)
        elif platform == "stackexchange":
            return data_methods.fetchers.getStackExchangeData(start_date, end_date, community, max_posts)
        elif platform == "rss":
            return data_methods.fetchers.getRSSData(start_date, end_date, community, max_posts)
        else:
            print("dataGrab.getData: platform not recognized")
            return None
@@ -263,6 +265,11 @@ class data_methods():
                'order': 'desc'
            }

        @staticmethod
        def build_rss_url(feed_url):
            """RSS feeds use the URL directly as provided in config"""
            return feed_url

    # ===== SCHEMA CONVERTERS =====
    class converters():
        """Functions to convert platform-specific data to unified schema"""
@@ -340,6 +347,81 @@ class data_methods():
                'meta': {'view_count': raw.get('view_count', 0)}
            }

        @staticmethod
        def rss_to_schema(entry, feed_url):
            """
            Convert RSS/Atom feed entry to unified schema.
            Supports both RSS 2.0 and Atom formats via feedparser.
            """
            import hashlib
            from html import unescape

            # Extract link (RSS uses 'link', Atom may use 'links')
            link = entry.get('link', '')
            if not link and 'links' in entry and len(entry['links']) > 0:
                link = entry['links'][0].get('href', '')

            # Generate ID from link or guid
            entry_id = entry.get('id', entry.get('guid', link))
            if not entry_id:
                # Fallback: hash of title + link
                entry_id = hashlib.md5(f"{entry.get('title', '')}{link}".encode()).hexdigest()

            # Clean up ID to make it URL-safe
            safe_id = entry_id.replace('/', '_').replace(':', '_')[:100]

            # Extract timestamp
            timestamp = 0
            if 'published_parsed' in entry and entry['published_parsed']:
                import time
                timestamp = int(time.mktime(entry['published_parsed']))
            elif 'updated_parsed' in entry and entry['updated_parsed']:
                import time
                timestamp = int(time.mktime(entry['updated_parsed']))

            # Extract author
            author = 'unknown'
            if 'author' in entry:
                author = entry['author']
            elif 'author_detail' in entry:
                author = entry['author_detail'].get('name', 'unknown')

            # Extract content (try summary, then description, then content)
            content = ''
            if 'summary' in entry:
                content = unescape(entry['summary'])
            elif 'description' in entry:
                content = unescape(entry['description'])
            elif 'content' in entry and len(entry['content']) > 0:
                content = unescape(entry['content'][0].get('value', ''))

            # Strip HTML tags for cleaner content
            import re
            content = re.sub(r'<[^>]+>', '', content)

            # Extract tags/categories
            tags = []
            if 'tags' in entry:
                tags = [tag.get('term', '') for tag in entry['tags']]

            return {
                'platform': 'rss',
                'id': f"rss_{safe_id}",
                'title': unescape(entry.get('title', 'Untitled')),
                'author': author,
                'timestamp': timestamp,
                'score': 0,  # RSS doesn't have scores
                'replies': 0,  # RSS doesn't track comments
                'url': link,
                'content': content[:1000],  # Limit content length
                'source': feed_url,
                'tags': tags,
                'meta': {
                    'feed_url': feed_url,
                    'guid': entry.get('guid', '')
                }
            }

    # ===== COMMENT FETCHERS =====
    class comment_fetchers():
        """Functions to fetch comments for posts from various platforms"""
@@ -621,3 +703,42 @@ class data_methods():
            # Fetch and convert
            raw = data_methods.utils.http_get_json(url, params=params)
            return [data_methods.converters.stackexchange_to_schema(q, community) for q in raw.get('items', [])]

        @staticmethod
        def getRSSData(start_date, end_date, feed_url, max_posts):
            """
            Fetch and parse RSS/Atom feeds.
            Requires feedparser library: pip install feedparser
            """
            try:
                import feedparser
            except ImportError:
                print("Error: feedparser not installed. Run: pip install feedparser")
                return []

            try:
                # Fetch and parse RSS feed
                feed = feedparser.parse(feed_url)

                # Check for errors
                if hasattr(feed, 'bozo') and feed.bozo:
                    print(f"Warning: RSS feed may have issues: {feed.get('bozo_exception', 'Unknown error')}")

                # Extract entries
                entries = feed.get('entries', [])[:max_posts]
                if not entries:
                    print(f"No entries found in RSS feed: {feed_url}")
                    return []

                # Convert entries to unified schema
                posts = [data_methods.converters.rss_to_schema(entry, feed_url) for entry in entries]

                # Filter by date range
                return data_methods.utils.filter_by_date_range(posts, start_date, end_date)

            except Exception as e:
                print(f"Error fetching RSS feed {feed_url}: {e}")
                import traceback
                traceback.print_exc()
                return []
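
Closing with a usage sketch: the date-string format expected by data_methods.utils.filter_by_date_range is not shown in this diff, so the "YYYY-MM-DD" strings below are an assumption, as is the dataGrab module name (inferred from the error message in getData).

# Usage sketch; date format and module name are assumptions.
from dataGrab import data_methods

posts = data_methods.fetchers.getRSSData(
    "2025-10-01",               # start_date (format assumed)
    "2025-10-11",               # end_date (format assumed)
    "https://lobste.rs/rss",    # any feed URL added as a collection source
    25,                         # max_posts
)
for post in posts:
    print(post['timestamp'], post['title'], post['url'])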