diff --git a/data_collection_lib.py b/data_collection_lib.py
index 43d6878..dd30194 100644
--- a/data_collection_lib.py
+++ b/data_collection_lib.py
@@ -80,6 +80,8 @@ class data_methods():
             return data_methods.fetchers.getLobstersData(start_date, end_date, community, max_posts)
         elif platform == "stackexchange":
             return data_methods.fetchers.getStackExchangeData(start_date, end_date, community, max_posts)
+        elif platform == "rss":
+            return data_methods.fetchers.getRSSData(start_date, end_date, community, max_posts)
         else:
             print("dataGrab.getData: platform not recognized")
             return None
@@ -263,6 +265,11 @@ class data_methods():
             'order': 'desc'
         }
 
+        @staticmethod
+        def build_rss_url(feed_url):
+            """RSS feeds use the URL directly as provided in config"""
+            return feed_url
+
     # ===== SCHEMA CONVERTERS =====
     class converters():
         """Functions to convert platform-specific data to unified schema"""
@@ -340,6 +347,81 @@ class data_methods():
                 'meta': {'view_count': raw.get('view_count', 0)}
             }
 
+        @staticmethod
+        def rss_to_schema(entry, feed_url):
+            """
+            Convert RSS/Atom feed entry to unified schema.
+            Supports both RSS 2.0 and Atom formats via feedparser.
+            """
+            import hashlib
+            from html import unescape
+
+            # Extract link (RSS uses 'link', Atom may use 'links')
+            link = entry.get('link', '')
+            if not link and 'links' in entry and len(entry['links']) > 0:
+                link = entry['links'][0].get('href', '')
+
+            # Generate ID from link or guid
+            entry_id = entry.get('id', entry.get('guid', link))
+            if not entry_id:
+                # Fallback: hash of title + link
+                entry_id = hashlib.md5(f"{entry.get('title', '')}{link}".encode()).hexdigest()
+
+            # Clean up ID to make it URL-safe
+            safe_id = entry_id.replace('/', '_').replace(':', '_')[:100]
+
+            # Extract timestamp (feedparser *_parsed fields are UTC struct_time; timegm, not mktime)
+            timestamp = 0
+            if 'published_parsed' in entry and entry['published_parsed']:
+                import calendar
+                timestamp = int(calendar.timegm(entry['published_parsed']))
+            elif 'updated_parsed' in entry and entry['updated_parsed']:
+                import calendar
+                timestamp = int(calendar.timegm(entry['updated_parsed']))
+
+            # Extract author
+            author = 'unknown'
+            if 'author' in entry:
+                author = entry['author']
+            elif 'author_detail' in entry:
+                author = entry['author_detail'].get('name', 'unknown')
+
+            # Extract content (try summary, then description, then content)
+            content = ''
+            if 'summary' in entry:
+                content = unescape(entry['summary'])
+            elif 'description' in entry:
+                content = unescape(entry['description'])
+            elif 'content' in entry and len(entry['content']) > 0:
+                content = unescape(entry['content'][0].get('value', ''))
+
+            # Strip HTML tags for cleaner content
+            import re
+            content = re.sub(r'<[^>]+>', '', content)
+
+            # Extract tags/categories
+            tags = []
+            if 'tags' in entry:
+                tags = [tag.get('term', '') for tag in entry['tags']]
+
+            return {
+                'platform': 'rss',
+                'id': f"rss_{safe_id}",
+                'title': unescape(entry.get('title', 'Untitled')),
+                'author': author,
+                'timestamp': timestamp,
+                'score': 0,  # RSS doesn't have scores
+                'replies': 0,  # RSS doesn't track comments
+                'url': link,
+                'content': content[:1000],  # Limit content length
+                'source': feed_url,
+                'tags': tags,
+                'meta': {
+                    'feed_url': feed_url,
+                    'guid': entry.get('guid', '')
+                }
+            }
+
     # ===== COMMENT FETCHERS =====
     class comment_fetchers():
         """Functions to fetch comments for posts from various platforms"""
@@ -621,3 +703,42 @@ class data_methods():
         # Fetch and convert
         raw = data_methods.utils.http_get_json(url, params=params)
         return [data_methods.converters.stackexchange_to_schema(q, community) for q in raw.get('items', [])]
+
+    @staticmethod
+    def getRSSData(start_date, end_date, feed_url, max_posts):
+        """
+        Fetch and parse RSS/Atom feeds.
+        Requires feedparser library: pip install feedparser
+        """
+        try:
+            import feedparser
+        except ImportError:
+            print("Error: feedparser not installed. Run: pip install feedparser")
+            return []
+
+        try:
+            # Fetch and parse RSS feed
+            feed = feedparser.parse(feed_url)
+
+            # Check for errors
+            if hasattr(feed, 'bozo') and feed.bozo:
+                print(f"Warning: RSS feed may have issues: {feed.get('bozo_exception', 'Unknown error')}")
+
+            # Extract entries
+            entries = feed.get('entries', [])[:max_posts]
+
+            if not entries:
+                print(f"No entries found in RSS feed: {feed_url}")
+                return []
+
+            # Convert entries to unified schema
+            posts = [data_methods.converters.rss_to_schema(entry, feed_url) for entry in entries]
+
+            # Filter by date range
+            return data_methods.utils.filter_by_date_range(posts, start_date, end_date)
+
+        except Exception as e:
+            print(f"Error fetching RSS feed {feed_url}: {e}")
+            import traceback
+            traceback.print_exc()
+            return []
diff --git a/platform_config.json b/platform_config.json
new file mode 100644
index 0000000..6b9b6bc
--- /dev/null
+++ b/platform_config.json
@@ -0,0 +1,262 @@
+{
+  "platforms": {
+    "reddit": {
+      "name": "Reddit",
+      "icon": "🔺",
+      "color": "#ff4500",
+      "prefix": "r/",
+      "supports_communities": true,
+      "communities": [
+        {
+          "id": "programming",
+          "name": "Programming",
+          "display_name": "r/programming",
+          "icon": "💻",
+          "description": "General programming discussions"
+        },
+        {
+          "id": "python",
+          "name": "Python",
+          "display_name": "r/python",
+          "icon": "🐍",
+          "description": "Python programming language"
+        },
+        {
+          "id": "javascript",
+          "name": "JavaScript",
+          "display_name": "r/javascript",
+          "icon": "🟨",
+          "description": "JavaScript programming"
+        },
+        {
+          "id": "webdev",
+          "name": "Web Development",
+          "display_name": "r/webdev",
+          "icon": "🌐",
+          "description": "Web development discussions"
+        },
+        {
+          "id": "technology",
+          "name": "Technology",
+          "display_name": "r/technology",
+          "icon": "⚡",
+          "description": "Technology news and discussions"
+        }
+      ]
+    },
+    "hackernews": {
+      "name": "Hacker News",
+      "icon": "🧮",
+      "color": "#ff6600",
+      "prefix": "",
+      "supports_communities": false,
+      "communities": [
+        {
+          "id": "front_page",
+          "name": "Front Page",
+          "display_name": "Hacker News",
+          "icon": "🧮",
+          "description": "Top stories from Hacker News"
+        },
+        {
+          "id": "new",
+          "name": "New Stories",
+          "display_name": "HN New",
+          "icon": "🆕",
+          "description": "Latest submissions"
+        },
+        {
+          "id": "ask",
+          "name": "Ask HN",
+          "display_name": "Ask HN",
+          "icon": "❓",
+          "description": "Questions for the community"
+        },
+        {
+          "id": "show",
+          "name": "Show HN",
+          "display_name": "Show HN",
+          "icon": "🎯",
+          "description": "User projects and demos"
+        }
+      ]
+    },
+    "lobsters": {
+      "name": "Lobsters",
+      "icon": "🦞",
+      "color": "#800020",
+      "prefix": "",
+      "supports_communities": false,
+      "communities": [
+        {
+          "id": "all",
+          "name": "All Stories",
+          "display_name": "Lobsters",
+          "icon": "🦞",
+          "description": "All lobsters stories"
+        }
+      ]
+    },
+    "github": {
+      "name": "GitHub",
+      "icon": "🐙",
+      "color": "#24292e",
+      "prefix": "",
+      "supports_communities": false,
+      "communities": [
+        {
+          "id": "trending",
+          "name": "Trending",
+          "display_name": "GitHub Trending",
+          "icon": "📈",
+          "description": "Trending repositories"
+        },
+        {
+          "id": "releases",
+          "name": "Releases",
+          "display_name": "New Releases",
+          "icon": "🎉",
+          "description": "Latest software releases"
+        }
+      ]
+    },
+    "devto": {
+      "name": "Dev.to",
+      "icon": "📝",
+      "color": "#0a0a0a",
+      "prefix": "",
+      "supports_communities": false,
+      "communities": [
+        {
+          "id": "top",
+          "name": "Top Posts",
+          "display_name": "Dev.to Top",
+          "icon": "⭐",
+          "description": "Most popular dev posts"
+        },
+        {
+          "id": "latest",
+          "name": "Latest",
+          "display_name": "Dev.to Latest",
+          "icon": "🆕",
+          "description": "Recently published articles"
+        }
+      ]
+    },
+    "stackoverflow": {
+      "name": "Stack Overflow",
+      "icon": "📚",
+      "color": "#f48024",
+      "prefix": "",
+      "supports_communities": false,
+      "communities": [
+        {
+          "id": "featured",
+          "name": "Featured",
+          "display_name": "SO Featured",
+          "icon": "⭐",
+          "description": "Featured questions"
+        },
+        {
+          "id": "newest",
+          "name": "Newest",
+          "display_name": "SO Newest",
+          "icon": "🆕",
+          "description": "Recent questions"
+        }
+      ]
+    },
+    "rss": {
+      "name": "RSS Feeds",
+      "icon": "📡",
+      "color": "#ee802f",
+      "prefix": "",
+      "supports_communities": true,
+      "communities": [
+        {
+          "id": "https://hnrss.org/frontpage",
+          "name": "HN RSS",
+          "display_name": "HN RSS Feed",
+          "icon": "🧮",
+          "description": "Hacker News front page via RSS"
+        },
+        {
+          "id": "https://lobste.rs/rss",
+          "name": "Lobsters RSS",
+          "display_name": "Lobsters RSS Feed",
+          "icon": "🦞",
+          "description": "Lobsters community via RSS"
+        },
+        {
+          "id": "https://www.reddit.com/r/programming/.rss",
+          "name": "r/programming RSS",
+          "display_name": "r/programming RSS",
+          "icon": "💻",
+          "description": "Reddit programming subreddit via RSS"
+        }
+      ]
+    }
+  },
+  "collection_targets": [
+    {
+      "platform": "reddit",
+      "community": "programming",
+      "max_posts": 75,
+      "priority": "high"
+    },
+    {
+      "platform": "reddit",
+      "community": "python",
+      "max_posts": 75,
+      "priority": "high"
+    },
+    {
+      "platform": "reddit",
+      "community": "javascript",
+      "max_posts": 50,
+      "priority": "medium"
+    },
+    {
+      "platform": "reddit",
+      "community": "webdev",
+      "max_posts": 50,
+      "priority": "medium"
+    },
+    {
+      "platform": "reddit",
+      "community": "technology",
+      "max_posts": 50,
+      "priority": "medium"
+    },
+    {
+      "platform": "hackernews",
+      "community": "front_page",
+      "max_posts": 100,
+      "priority": "high"
+    },
+    {
+      "platform": "hackernews",
+      "community": "ask",
+      "max_posts": 25,
+      "priority": "medium"
+    },
+    {
+      "platform": "hackernews",
+      "community": "show",
+      "max_posts": 25,
+      "priority": "medium"
+    },
+    {
+      "platform": "lobsters",
+      "community": "all",
+      "max_posts": 30,
+      "priority": "medium"
+    },
+    {
+      "platform": "rss",
+      "community": "https://hnrss.org/frontpage",
+      "max_posts": 50,
+      "priority": "low"
+    }
+  ]
+}
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 1b1567f..2555dd4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,3 +11,4 @@ sqlalchemy==2.0.36
 authlib==1.3.2
 APScheduler==3.10.4
 praw==7.7.1
+feedparser==6.0.12