import requests
import json
import datetime as dt
import time
import logging
from collections import defaultdict, deque
from urllib.parse import urlparse

logger = logging.getLogger(__name__)

class RateLimiter:
    """
    Simple rate limiter to prevent excessive API calls.
    Tracks requests per domain and enforces delays.
    """

    def __init__(self):
        self.request_times = defaultdict(deque)  # domain -> deque of timestamps
        self.domain_limits = {
            'reddit.com': {'requests': 60, 'window': 60},                   # 60 requests per minute
            'api.stackexchange.com': {'requests': 300, 'window': 86400},    # 300 per day
            'hacker-news.firebaseio.com': {'requests': 300, 'window': 60},  # 300 per minute
            'lobste.rs': {'requests': 30, 'window': 60},                    # 30 per minute
            'default': {'requests': 60, 'window': 60}                       # default rate limit
        }

    def wait_if_needed(self, url: str):
        """
        Check the rate limit and wait if necessary before making a request.

        Args:
            url: The URL being requested
        """
        # Normalise the domain so e.g. 'www.reddit.com' matches the 'reddit.com' limit
        domain = urlparse(url).netloc.lower()
        if domain.startswith('www.'):
            domain = domain[4:]
        current_time = time.time()

        # Get the rate limit for this domain
        limit_config = self.domain_limits.get(domain, self.domain_limits['default'])
        max_requests = limit_config['requests']
        time_window = limit_config['window']

        # Recorded request times for this domain
        times = self.request_times[domain]

        # Drop requests that fall outside the time window
        cutoff_time = current_time - time_window
        while times and times[0] < cutoff_time:
            times.popleft()

        # If we're at the limit, wait until the oldest request leaves the window
        if len(times) >= max_requests:
            oldest_request = times[0]
            wait_time = time_window - (current_time - oldest_request)

            if wait_time > 0:
                logger.info(f"Rate limit reached for {domain}. Waiting {wait_time:.1f}s")
                time.sleep(wait_time)

        # Record this request
        times.append(current_time)


# Global rate limiter instance
_rate_limiter = RateLimiter()
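#
# Illustrative sketch (not executed here): every call to utils.http_get_json()
# below routes through this shared limiter, so a hypothetical tight loop of
# Reddit requests is automatically throttled to ~60 calls per rolling minute:
#
#   for url in reddit_urls:                    # hypothetical list of URLs
#       _rate_limiter.wait_if_needed(url)      # blocks once the window is full
#       response = requests.get(url, timeout=30)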


# A collection of static methods to grab Reddit and Reddit-like data from various sources.
class data_methods():

    @staticmethod
    def getData(platform, start_date, end_date, community, max_posts):
        if platform == "reddit":
            return data_methods.fetchers.getRedditData(start_date, end_date, community, max_posts)
        elif platform == "pushshift":
            return data_methods.fetchers.getPushshiftData(start_date, end_date, community, max_posts)
        elif platform == "hackernews":
            return data_methods.fetchers.getHackerNewsData(start_date, end_date, community, max_posts)
        elif platform == "lobsters":
            return data_methods.fetchers.getLobstersData(start_date, end_date, community, max_posts)
        elif platform == "stackexchange":
            return data_methods.fetchers.getStackExchangeData(start_date, end_date, community, max_posts)
        else:
            print("dataGrab.getData: platform not recognized")
            return None
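
    # Illustrative call (hypothetical arguments): fetch up to 100 posts from
    # r/programming between two ISO dates via the unified entry point:
    #
    #   posts = data_methods.getData("reddit", "2024-01-01", "2024-01-31",
    #                                "programming", 100)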

    # ===== ATOMIC UTILITY FUNCTIONS =====
    class utils():
        """Generic utility functions used across all fetchers"""

        @staticmethod
        def http_get_json(url, headers=None, params=None, timeout=30, max_retries=3):
            """
            Generic HTTP GET request that returns JSON, with comprehensive error handling.

            Args:
                url: Target URL
                headers: HTTP headers
                params: Query parameters
                timeout: Request timeout in seconds
                max_retries: Maximum number of retry attempts

            Returns:
                JSON response data

            Raises:
                requests.RequestException: On persistent failure after retries
            """
            for attempt in range(max_retries + 1):
                try:
                    # Delay retries with exponential backoff (max 30s)
                    if attempt > 0:
                        delay = min(2 ** attempt, 30)
                        logger.info(f"Retrying request to {url} in {delay}s (attempt {attempt + 1}/{max_retries + 1})")
                        time.sleep(delay)

                    # Apply rate limiting before making the request
                    _rate_limiter.wait_if_needed(url)

                    response = requests.get(url, headers=headers, params=params, timeout=timeout)

                    # Handle rate limiting (HTTP 429) with the server-suggested delay
                    if response.status_code == 429:
                        retry_after = int(response.headers.get('Retry-After', 60))
                        if attempt < max_retries:
                            logger.warning(f"Rate limited. Waiting {retry_after}s before retry")
                            time.sleep(retry_after)
                            continue

                    response.raise_for_status()

                    # Validate the JSON response
                    try:
                        return response.json()
                    except ValueError as e:
                        logger.error(f"Invalid JSON response from {url}: {e}")
                        if attempt < max_retries:
                            continue
                        raise requests.RequestException(f"Invalid JSON response: {e}")

                except requests.exceptions.Timeout:
                    logger.warning(f"Request timeout for {url} (attempt {attempt + 1})")
                    if attempt == max_retries:
                        raise

                except requests.exceptions.ConnectionError:
                    logger.warning(f"Connection error for {url} (attempt {attempt + 1})")
                    if attempt == max_retries:
                        raise

                except requests.exceptions.HTTPError as e:
                    # Don't retry on client errors (4xx) except rate limiting
                    if 400 <= e.response.status_code < 500 and e.response.status_code != 429:
                        logger.error(f"Client error {e.response.status_code} for {url}: {e}")
                        raise
                    logger.warning(f"HTTP error {e.response.status_code} for {url} (attempt {attempt + 1})")
                    if attempt == max_retries:
                        raise

                except Exception as e:
                    logger.error(f"Unexpected error for {url}: {e}")
                    if attempt == max_retries:
                        raise

            raise requests.RequestException(f"Failed to fetch {url} after {max_retries + 1} attempts")
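
        # Illustrative call (api_url is a hypothetical variable): any JSON endpoint
        # can be fetched with retries and rate limiting in one line:
        #
        #   data = data_methods.utils.http_get_json(api_url, timeout=10, max_retries=2)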

        @staticmethod
        def filter_by_date_range(posts, start_date, end_date):
            """Filter posts by timestamp range"""
            start_ts = int(dt.datetime.fromisoformat(start_date).timestamp())
            end_ts = int(dt.datetime.fromisoformat(end_date).timestamp())
            return [p for p in posts if p and start_ts <= p['timestamp'] <= end_ts]

        @staticmethod
        def convert_iso_to_timestamp(iso_string):
            """Convert an ISO-format datetime string to a Unix timestamp"""
            return int(dt.datetime.fromisoformat(iso_string.replace('Z', '+00:00')).timestamp())
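
        # For example, convert_iso_to_timestamp("2024-01-01T00:00:00Z") == 1704067200.
        # Note that filter_by_date_range parses naive date strings (no offset) in the
        # machine's local timezone.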

    # ===== URL AND PARAMETER BUILDERS =====
    class builders():
        """Functions to build URLs, headers, and parameters"""

        @staticmethod
        def build_reddit_url(subreddit):
            return f"https://www.reddit.com/r/{subreddit}/new.json"

        @staticmethod
        def build_reddit_headers():
            return {'User-Agent': 'Mozilla/5.0 (compatible; DataCollector/1.0)'}

        @staticmethod
        def build_reddit_params(limit):
            return {'limit': limit}

        @staticmethod
        def build_reddit_search_url(subreddit, start_date, end_date):
            """Build a Reddit search URL for time-based queries"""
            return f"https://www.reddit.com/r/{subreddit}/search.json"

        @staticmethod
        def build_reddit_search_params(limit, start_date, end_date):
            """Build search parameters for the Reddit API with time constraints"""
            # Convert date strings to timestamps for the Reddit API
            try:
                start_ts = int(dt.datetime.fromisoformat(start_date.replace('Z', '+00:00')).timestamp())
                end_ts = int(dt.datetime.fromisoformat(end_date.replace('Z', '+00:00')).timestamp())

                # Attempt a time-bounded query using 'after:'/'before:' operators.
                # Public Reddit search does not reliably honour these, so results are
                # filtered by date again downstream.
                query = f"after:{start_ts} before:{end_ts}"

                return {
                    'q': query,
                    'sort': 'new',
                    'restrict_sr': 'true',  # Restrict to the subreddit
                    'limit': limit,
                    't': 'all'              # Time period: all
                }
            except (ValueError, TypeError):
                # Fall back to a simple search without time constraints
                return {
                    'q': '*',               # Match all posts
                    'sort': 'new',
                    'restrict_sr': 'true',
                    'limit': limit,
                    't': 'week'             # Default to the past week
                }
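
        # For example, build_reddit_search_params(100, "2024-01-01T00:00:00Z",
        # "2024-01-31T00:00:00Z") yields:
        #   {'q': 'after:1704067200 before:1706659200', 'sort': 'new',
        #    'restrict_sr': 'true', 'limit': 100, 't': 'all'}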

        @staticmethod
        def build_hackernews_top_stories_url():
            return "https://hacker-news.firebaseio.com/v0/topstories.json"

        @staticmethod
        def build_hackernews_story_url(story_id):
            return f"https://hacker-news.firebaseio.com/v0/item/{story_id}.json"

        @staticmethod
        def build_lobsters_url():
            return "https://lobste.rs/hottest.json"

        @staticmethod
        def build_stackexchange_url():
            return "https://api.stackexchange.com/2.3/questions"

        @staticmethod
        def build_stackexchange_params(site, limit, start_date, end_date):
            start_ts = int(dt.datetime.fromisoformat(start_date).timestamp())
            end_ts = int(dt.datetime.fromisoformat(end_date).timestamp())
            return {
                'site': site,
                'pagesize': limit,
                'fromdate': start_ts,
                'todate': end_ts,
                'sort': 'votes',
                'order': 'desc'
            }
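
        # Illustrative request (hypothetical values): top-voted Stack Overflow
        # questions inside a date window:
        #
        #   url = data_methods.builders.build_stackexchange_url()
        #   params = data_methods.builders.build_stackexchange_params(
        #       "stackoverflow", 50, "2024-01-01", "2024-01-31")
        #   questions = data_methods.utils.http_get_json(url, params=params)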

    # ===== SCHEMA CONVERTERS =====
    class converters():
        """Functions to convert platform-specific data to the unified schema"""

        @staticmethod
        def reddit_to_schema(child):
            post = child['data']
            return {
                'platform': 'reddit',
                'id': post.get('id'),
                'title': post.get('title'),
                'author': post.get('author'),
                'timestamp': int(post.get('created_utc', 0)),
                'score': post.get('score', 0),
                'replies': post.get('num_comments', 0),
                'url': post.get('url'),
                'content': post.get('selftext', ''),
                'source': post.get('subreddit'),
                'tags': [post.get('link_flair_text', '')],
                'meta': {'is_self': post.get('is_self', False)}
            }

        @staticmethod
        def hackernews_to_schema(raw):
            if not raw or raw.get('type') != 'story':
                return None
            return {
                'platform': 'hackernews',
                'id': f"hn_{raw.get('id')}",
                'title': raw.get('title'),
                'author': raw.get('by', 'unknown'),
                'timestamp': int(raw.get('time', 0)),
                'score': raw.get('score', 0),
                'replies': raw.get('descendants', 0),
                'url': raw.get('url', f"https://news.ycombinator.com/item?id={raw.get('id')}"),
                'content': raw.get('text', ''),
                'source': 'hackernews',
                'tags': ['hackernews'],
                'meta': {}
            }

        @staticmethod
        def lobsters_to_schema(raw):
            submitter = raw.get('submitter_user', 'unknown')
            author = submitter.get('username', 'unknown') if isinstance(submitter, dict) else submitter
            return {
                'platform': 'lobsters',
                'id': f"lob_{raw.get('short_id')}",
                'title': raw.get('title'),
                'author': author,
                'timestamp': data_methods.utils.convert_iso_to_timestamp(raw.get('created_at')),
                'score': raw.get('score', 0),
                'replies': raw.get('comment_count', 0),
                'url': raw.get('url', raw.get('comments_url')),
                'content': raw.get('description', ''),
                'source': 'lobsters',
                'tags': raw.get('tags', []),
                'meta': {}
            }

        @staticmethod
        def stackexchange_to_schema(raw, community):
            return {
                'platform': 'stackexchange',
                'id': f"se_{raw.get('question_id')}",
                'title': raw.get('title'),
                'author': raw.get('owner', {}).get('display_name', 'unknown'),
                'timestamp': int(raw.get('creation_date', 0)),
                'score': raw.get('score', 0),
                'replies': raw.get('answer_count', 0),
                'url': raw.get('link'),
                'content': '',
                'source': community,
                'tags': raw.get('tags', []),
                'meta': {'view_count': raw.get('view_count', 0)}
            }
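
        # All converters emit the same unified shape; an illustrative Reddit post
        # (values are made up) looks like:
        #   {'platform': 'reddit', 'id': 'abc123', 'title': '...', 'author': 'someuser',
        #    'timestamp': 1704067200, 'score': 42, 'replies': 7, 'url': 'https://...',
        #    'content': '...', 'source': 'programming', 'tags': ['discussion'],
        #    'meta': {'is_self': True}}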

    # ===== COMMENT FETCHERS =====
    class comment_fetchers():
        """Functions to fetch comments for posts from various platforms"""

        @staticmethod
        def fetch_reddit_comments(post_id, subreddit, max_comments=50):
            """
            Fetch comments for a Reddit post.
            Note: the Reddit JSON API has limited comment support without authentication.
            Returns a list of comment dicts with parent relationships.
            """
            # Reddit comment API: /r/{subreddit}/comments/{post_id}.json
            url = f"https://www.reddit.com/r/{subreddit}/comments/{post_id}.json"
            headers = {'User-Agent': 'Mozilla/5.0 (compatible; DataCollector/1.0)'}

            try:
                raw = data_methods.utils.http_get_json(url, headers=headers)

                # Reddit returns [post_data, comments_data]
                if len(raw) < 2:
                    return []

                comments_data = raw[1]['data']['children']
                comments = []

                def extract_comment(comment_obj, parent_id=None, depth=0):
                    if comment_obj['kind'] != 't1':  # t1 = comment
                        return

                    data = comment_obj['data']
                    comments.append({
                        'id': data.get('id'),
                        'parent_comment_id': parent_id,
                        'author': data.get('author', '[deleted]'),
                        'content': data.get('body', ''),
                        'timestamp': int(data.get('created_utc', 0)),
                        'score': data.get('score', 0),
                        'depth': depth
                    })

                    # Process replies
                    if 'replies' in data and isinstance(data['replies'], dict):
                        for reply in data['replies']['data']['children']:
                            extract_comment(reply, data.get('id'), depth + 1)

                # Extract all comments
                for comment_obj in comments_data:
                    extract_comment(comment_obj, None, 0)

                return comments[:max_comments]

            except Exception as e:
                print(f"Error fetching Reddit comments: {e}")
                return []
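
        # Each returned comment dict (illustrative values) has the shape:
        #   {'id': 'k2abcd', 'parent_comment_id': None, 'author': 'someuser',
        #    'content': '...', 'timestamp': 1704067200, 'score': 3, 'depth': 0}
        # Replies carry the parent comment's id and an incremented depth.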

        @staticmethod
        def fetch_hackernews_comments(story_id, max_comments=50):
            """
            Fetch comments for a HackerNews story.
            HN provides comment IDs in the 'kids' field.
            """
            comments = []

            def fetch_comment_recursive(comment_id, parent_id=None, depth=0):
                if len(comments) >= max_comments:
                    return

                url = f"https://hacker-news.firebaseio.com/v0/item/{comment_id}.json"
                try:
                    raw = data_methods.utils.http_get_json(url)

                    if not raw or raw.get('deleted') or raw.get('dead'):
                        return

                    comments.append({
                        'id': str(raw.get('id')),
                        'parent_comment_id': parent_id,
                        'author': raw.get('by', 'unknown'),
                        'content': raw.get('text', ''),
                        'timestamp': int(raw.get('time', 0)),
                        'score': 0,  # HN doesn't provide comment scores via the API
                        'depth': depth
                    })

                    # Fetch child comments
                    if 'kids' in raw:
                        for kid_id in raw['kids'][:5]:  # Limit children per comment
                            fetch_comment_recursive(kid_id, str(raw.get('id')), depth + 1)

                except Exception as e:
                    print(f"Error fetching HN comment {comment_id}: {e}")

            # Start with the top-level comment IDs from the story
            try:
                story_url = f"https://hacker-news.firebaseio.com/v0/item/{story_id}.json"
                story = data_methods.utils.http_get_json(story_url)

                if 'kids' in story:
                    for kid_id in story['kids'][:10]:  # Limit top-level comments
                        fetch_comment_recursive(kid_id, None, 0)

            except Exception as e:
                print(f"Error fetching HN story for comments: {e}")

            return comments
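
        # Illustrative call (hypothetical story id): fetch_hackernews_comments(38123456)
        # walks at most 10 top-level comments and up to 5 children per comment,
        # stopping once max_comments entries have been collected.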

        @staticmethod
        def fetch_lobsters_comments(story_id):
            """
            Fetch comments for a Lobsters story.
            Currently a stub: the public Lobsters API doesn't easily expose comment
            trees, so this would require parsing HTML or using the authenticated API.
            """
            return []

        @staticmethod
        def fetch_stackexchange_comments(question_id, site='stackoverflow'):
            """
            Fetch comments for a StackExchange question and its answers.
            Uses the public StackExchange API v2.3.
            """
            comments = []

            try:
                # First, get the question's own comments
                question_comments_url = f"https://api.stackexchange.com/2.3/questions/{question_id}/comments"
                params = {
                    'site': site,
                    'filter': 'withbody',  # Built-in filter that adds comment bodies to the default fields
                    'page': 1,
                    'pagesize': 100
                }

                response = data_methods.utils.http_get_json(question_comments_url, params=params)
                if response and 'items' in response:
                    for comment in response['items']:
                        comments.append({
                            'uuid': f"se_{site}_{comment['comment_id']}",
                            'platform': 'stackexchange',
                            'source': site,
                            'content': comment.get('body', ''),
                            'author': comment.get('owner', {}).get('display_name', 'Anonymous'),
                            'timestamp': dt.datetime.fromtimestamp(
                                comment.get('creation_date', 0), tz=dt.timezone.utc
                            ).isoformat().replace('+00:00', 'Z'),
                            'score': comment.get('score', 0),
                            'parent_post_id': str(question_id),
                            'parent_comment_uuid': None,  # Top-level comment
                            'depth': 0,
                            'se_comment_id': comment['comment_id'],
                            'se_post_id': comment.get('post_id'),
                            'se_post_type': comment.get('post_type', 'question')
                        })

                # Then get the answer IDs for this question
                answers_url = f"https://api.stackexchange.com/2.3/questions/{question_id}/answers"
                answers_params = {
                    'site': site,
                    'filter': 'default',
                    'page': 1,
                    'pagesize': 50
                }

                answers_response = data_methods.utils.http_get_json(answers_url, params=answers_params)
                if answers_response and 'items' in answers_response:
                    # Get the comments on each answer
                    for answer in answers_response['items']:
                        answer_id = answer['answer_id']
                        answer_comments_url = f"https://api.stackexchange.com/2.3/answers/{answer_id}/comments"

                        answer_comments_response = data_methods.utils.http_get_json(answer_comments_url, params=params)
                        if answer_comments_response and 'items' in answer_comments_response:
                            for comment in answer_comments_response['items']:
                                comments.append({
                                    'uuid': f"se_{site}_{comment['comment_id']}",
                                    'platform': 'stackexchange',
                                    'source': site,
                                    'content': comment.get('body', ''),
                                    'author': comment.get('owner', {}).get('display_name', 'Anonymous'),
                                    'timestamp': dt.datetime.fromtimestamp(
                                        comment.get('creation_date', 0), tz=dt.timezone.utc
                                    ).isoformat().replace('+00:00', 'Z'),
                                    'score': comment.get('score', 0),
                                    'parent_post_id': str(answer_id),
                                    'parent_comment_uuid': None,  # SE comments are flat
                                    'depth': 0,
                                    'se_comment_id': comment['comment_id'],
                                    'se_post_id': comment.get('post_id'),
                                    'se_post_type': comment.get('post_type', 'answer')
                                })

                return comments[:100]  # Limit total comments

            except Exception as e:
                print(f"Error fetching StackExchange comments for {question_id} on {site}: {e}")
                return []
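
        # Illustrative call (hypothetical question id):
        #   fetch_stackexchange_comments(12345, site="stackoverflow")
        # returns flat comment records for the question and its answers, capped at 100.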

    # ===== PLATFORM FETCHERS (ORCHESTRATION) =====
    class fetchers():
        """Orchestration functions that compose the atomic functions"""

        @staticmethod
        def getRedditData(start_date, end_date, community, max_posts):
            # Build request components
            url = data_methods.builders.build_reddit_url(community)
            headers = data_methods.builders.build_reddit_headers()
            params = data_methods.builders.build_reddit_params(max_posts)

            # Fetch and extract
            raw = data_methods.utils.http_get_json(url, headers, params)
            children = raw['data']['children']

            # Convert and filter
            posts = [data_methods.converters.reddit_to_schema(c) for c in children]
            return data_methods.utils.filter_by_date_range(posts, start_date, end_date)

        @staticmethod
        def getPushshiftData(start_date, end_date, community, max_posts):
            """
            Alternative Reddit data collection using the official Reddit API.
            Since Pushshift is deprecated, this uses Reddit's native search/listing endpoints.
            """
            try:
                # Use Reddit's native search for historical posts within the date range
                url = data_methods.builders.build_reddit_search_url(community, start_date, end_date)
                headers = data_methods.builders.build_reddit_headers()
                params = data_methods.builders.build_reddit_search_params(max_posts, start_date, end_date)

                # Fetch data from Reddit search
                raw = data_methods.utils.http_get_json(url, headers, params)

                if not raw or 'data' not in raw or 'children' not in raw['data']:
                    return []

                children = raw['data']['children']

                # Convert and filter by date range
                posts = [data_methods.converters.reddit_to_schema(c) for c in children]
                return data_methods.utils.filter_by_date_range(posts, start_date, end_date)

            except Exception as e:
                print(f"Error fetching Reddit search data: {e}")
                return []

        @staticmethod
        def getHackerNewsData(start_date, end_date, community, max_posts):
            # Fetch story IDs
            ids_url = data_methods.builders.build_hackernews_top_stories_url()
            ids = data_methods.utils.http_get_json(ids_url)[:max_posts]

            # Fetch individual stories
            stories = []
            for story_id in ids:
                story_url = data_methods.builders.build_hackernews_story_url(story_id)
                stories.append(data_methods.utils.http_get_json(story_url))

            # Convert and filter
            posts = [data_methods.converters.hackernews_to_schema(s) for s in stories]
            return data_methods.utils.filter_by_date_range(posts, start_date, end_date)

        @staticmethod
        def getLobstersData(start_date, end_date, community, max_posts):
            # Fetch posts
            url = data_methods.builders.build_lobsters_url()
            raw = data_methods.utils.http_get_json(url)[:max_posts]

            # Convert and filter
            posts = [data_methods.converters.lobsters_to_schema(r) for r in raw]
            return data_methods.utils.filter_by_date_range(posts, start_date, end_date)

        @staticmethod
        def getStackExchangeData(start_date, end_date, community, max_posts):
            # Build request components
            url = data_methods.builders.build_stackexchange_url()
            params = data_methods.builders.build_stackexchange_params(community, max_posts, start_date, end_date)

            # Fetch and convert
            raw = data_methods.utils.http_get_json(url, params=params)
            return [data_methods.converters.stackexchange_to_schema(q, community) for q in raw.get('items', [])]