#!/usr/bin/env python3
"""
Data Collection Script

Collects posts and comments from multiple platforms with UUID-based storage.

Functional approach - no classes, just functions.
"""

import json
import uuid
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Tuple

from data_collection_lib import data_methods


# ===== STORAGE FUNCTIONS =====

def ensure_directories(storage_dir: str) -> Dict[str, Path]:
    """Create and return directory paths"""
    base = Path(storage_dir)

    dirs = {
        'posts': base / 'posts',
        'comments': base / 'comments',
        'moderation': base / 'moderation',
        'base': base
    }

    for path in dirs.values():
        path.mkdir(parents=True, exist_ok=True)

    return dirs
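
# On-disk layout this produces (a sketch; the UUID filenames are illustrative):
#
#   <storage_dir>/
#       posts/<uuid>.json          # one JSON file per post (save_post)
#       comments/<uuid>.json       # one JSON file per comment (save_comment)
#       moderation/<uuid>.json     # one stub per post/comment (create_moderation_stub)
#       post_index.json            # maps "platform_postid" -> post UUID (load/save_index)
#       collection_state.json      # resume info (load/save_state)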


def load_index(storage_dir: str) -> Dict:
    """Load post index from disk"""
    index_file = Path(storage_dir) / 'post_index.json'

    if index_file.exists():
        with open(index_file, 'r') as f:
            index = json.load(f)
        print(f"Loaded index with {len(index)} posts")
        return index

    return {}


def save_index(index: Dict, storage_dir: str):
    """Save post index to disk"""
    index_file = Path(storage_dir) / 'post_index.json'
    with open(index_file, 'w') as f:
        json.dump(index, f, indent=2)


def load_state(storage_dir: str) -> Dict:
    """Load collection state from disk"""
    state_file = Path(storage_dir) / 'collection_state.json'

    if state_file.exists():
        with open(state_file, 'r') as f:
            state = json.load(f)
        print(f"Loaded collection state: {state.get('last_run', 'never')}")
        return state

    return {}


def save_state(state: Dict, storage_dir: str):
    """Save collection state to disk"""
    state_file = Path(storage_dir) / 'collection_state.json'
    with open(state_file, 'w') as f:
        json.dump(state, f, indent=2)
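
# After a run, collection_state.json has the shape below (the keys are the ones
# collect_batch writes; the values shown are illustrative):
#
#   {
#     "last_run": "2024-01-01T12:00:00",
#     "total_posts": 1234,
#     "last_batch_count": 42
#   }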


def generate_uuid() -> str:
    """Generate a new UUID"""
    return str(uuid.uuid4())


# ===== MODERATION FUNCTIONS =====

def create_moderation_stub(target_id: str, target_type: str, dirs: Dict) -> str:
    """Create moderation stub file and return UUID"""
    mod_uuid = generate_uuid()

    moderation_data = {
        "target_id": target_id,
        "target_type": target_type,
        "analyzed_at": int(datetime.now().timestamp()),
        "model_version": "stub-1.0",
        "flags": {
            "requires_review": False,
            "is_blocked": False,
            "is_flagged": False,
            "is_safe": True
        }
    }

    mod_file = dirs['moderation'] / f"{mod_uuid}.json"
    with open(mod_file, 'w') as f:
        json.dump(moderation_data, f, indent=2)

    return mod_uuid


# ===== POST FUNCTIONS =====

def save_post(post: Dict, platform: str, index: Dict, dirs: Dict) -> str:
    """Save post to UUID-based file, return UUID"""
    post_id = f"{platform}_{post['id']}"

    # Check if already exists
    if post_id in index:
        return index[post_id]

    # Generate UUID and save
    post_uuid = generate_uuid()
    post['uuid'] = post_uuid
    post['moderation_uuid'] = create_moderation_stub(post_id, 'post', dirs)

    post_file = dirs['posts'] / f"{post_uuid}.json"
    with open(post_file, 'w') as f:
        json.dump(post, f, indent=2)

    # Update index
    index[post_id] = post_uuid

    return post_uuid
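

# Illustrative only: a hypothetical reader for the layout save_post writes.
# It is not called anywhere in this script.
def load_post(post_id: str, index: Dict, dirs: Dict) -> Optional[Dict]:
    """Load a saved post by its platform-prefixed id, e.g. 'reddit_abc123'."""
    post_uuid = index.get(post_id)
    if post_uuid is None:
        return None
    with open(dirs['posts'] / f"{post_uuid}.json", 'r') as f:
        return json.load(f)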


# ===== COMMENT FUNCTIONS =====

def save_comment(comment: Dict, post_uuid: str, platform: str, dirs: Dict) -> str:
    """Save comment to UUID-based file, return UUID"""
    comment_uuid = generate_uuid()

    comment['uuid'] = comment_uuid
    comment['post_uuid'] = post_uuid
    comment['platform'] = platform
    comment['moderation_uuid'] = create_moderation_stub(
        f"{platform}_comment_{comment['id']}",
        'comment',
        dirs
    )

    comment_file = dirs['comments'] / f"{comment_uuid}.json"
    with open(comment_file, 'w') as f:
        json.dump(comment, f, indent=2)

    return comment_uuid


def fetch_and_save_comments(post: Dict, platform: str, dirs: Dict, max_comments: int = 50) -> List[str]:
    """Fetch comments for post and save them, return list of UUIDs"""
    comments = []
    post_id = post.get('id')

    # Fetch comments based on platform
    if platform == 'reddit':
        source = post.get('source', '').replace('r/', '')
        comments = data_methods.comment_fetchers.fetch_reddit_comments(post_id, source, max_comments)
    elif platform == 'hackernews':
        if post_id and post_id.startswith('hn_'):
            story_id = post_id[3:]
            comments = data_methods.comment_fetchers.fetch_hackernews_comments(story_id, max_comments)

    # Save comments with parent UUID mapping
    comment_uuid_map = {}
    comment_uuids = []
    post_uuid = post.get('uuid')

    for comment in comments:
        # Map parent ID to UUID; this assumes fetchers return parents before
        # their children, otherwise the parent UUID falls back to None
        parent_id = comment.get('parent_comment_id')
        if parent_id and parent_id in comment_uuid_map:
            comment['parent_comment_uuid'] = comment_uuid_map[parent_id]
        else:
            comment['parent_comment_uuid'] = None

        # Save comment
        comment_uuid = save_comment(comment, post_uuid, platform, dirs)
        comment_uuid_map[comment['id']] = comment_uuid
        comment_uuids.append(comment_uuid)

    return comment_uuids
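

# Illustrative only: a hypothetical helper (not called by this script) showing how
# a thread can be rebuilt from the fields save_comment writes (uuid, post_uuid,
# parent_comment_uuid). Children are grouped under their parent's UUID; top-level
# comments end up under the key None.
def build_comment_tree(post_uuid: str, dirs: Dict) -> Dict[Optional[str], List[Dict]]:
    """Group a post's saved comments by parent_comment_uuid."""
    children: Dict[Optional[str], List[Dict]] = {}
    for path in dirs['comments'].glob('*.json'):
        with open(path, 'r') as f:
            comment = json.load(f)
        if comment.get('post_uuid') == post_uuid:
            children.setdefault(comment.get('parent_comment_uuid'), []).append(comment)
    return children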


# ===== COLLECTION FUNCTIONS =====

def collect_platform(platform: str, community: str, start_date: str, end_date: str,
                     max_posts: int, fetch_comments: bool, index: Dict, dirs: Dict) -> int:
    """Collect posts and comments from a platform, return count of new posts"""
    print(f"\nCollecting from {platform}" + (f"/{community}" if community else ""))

    try:
        # Fetch posts
        new_posts = data_methods.getData(platform, start_date, end_date, community, max_posts)

        if not new_posts:
            print("  No posts retrieved")
            return 0

        print(f"  Retrieved {len(new_posts)} posts")

        # Process each post
        added_count = 0
        for post in new_posts:
            post_id = f"{platform}_{post['id']}"

            # Skip if already collected
            if post_id in index:
                continue

            # Normalize the post's source field before saving, so downstream
            # consumers (e.g. the reddit comment fetcher) see the community name
            post['source'] = community if community else platform

            # Save post
            post_uuid = save_post(post, platform, index, dirs)
            added_count += 1

            # Fetch and save comments
            if fetch_comments:
                comment_uuids = fetch_and_save_comments(post, platform, dirs)
                if comment_uuids:
                    print(f"    Post {post['id']}: saved {len(comment_uuids)} comments")

        if added_count > 0:
            print(f"  Added {added_count} new posts")

        return added_count

    except Exception as e:
        print(f"  Error: {e}")
        import traceback
        traceback.print_exc()
        return 0


def calculate_date_range(days_back: int, state: Dict) -> Tuple[str, str]:
    """Calculate start and end dates for collection, considering resume"""
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_back)

    # Resume from last run if recent
    if state.get('last_run'):
        last_run = datetime.fromisoformat(state['last_run'])
        if (end_date - last_run).total_seconds() < 3600:  # Less than 1 hour ago
            print(f"Last run was {last_run.isoformat()}, resuming from that point")
            start_date = last_run

    return start_date.isoformat(), end_date.isoformat()
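
# Worked example: with days_back=1 and a last_run 30 minutes ago, the window
# shrinks from [now - 1 day, now] to [last_run, now], so the period already
# covered by the previous run is not re-requested.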


def collect_batch(sources: List[Dict], storage_dir: str, days_back: int = 1, fetch_comments: bool = True):
    """Main collection function - orchestrates everything"""

    # Setup
    dirs = ensure_directories(storage_dir)
    index = load_index(storage_dir)
    state = load_state(storage_dir)

    # Calculate date range
    start_iso, end_iso = calculate_date_range(days_back, state)

    print(f"\n{'='*60}")
    print(f"Collection Period: {start_iso} to {end_iso}")
    print(f"Fetch comments: {fetch_comments}")
    print(f"{'='*60}")

    # Collect from each source
    total_new = 0
    for source in sources:
        platform = source['platform']
        community = source.get('community', '')
        max_posts = source.get('max_posts', 100)

        count = collect_platform(
            platform, community, start_iso, end_iso,
            max_posts, fetch_comments, index, dirs
        )
        total_new += count

    # Update and save state
    state['last_run'] = end_iso
    state['total_posts'] = len(index)
    state['last_batch_count'] = total_new

    save_index(index, storage_dir)
    save_state(state, storage_dir)

    print(f"\n{'='*60}")
    print("Collection Complete")
    print(f"  New posts this run: {total_new}")
    print(f"  Total posts in stash: {len(index)}")
    print(f"{'='*60}\n")


def get_stats(storage_dir: str) -> Dict:
    """Get collection statistics"""
    dirs = ensure_directories(storage_dir)
    index = load_index(storage_dir)
    state = load_state(storage_dir)

    post_count = len(list(dirs['posts'].glob('*.json')))
    comment_count = len(list(dirs['comments'].glob('*.json')))
    moderation_count = len(list(dirs['moderation'].glob('*.json')))

    return {
        'total_posts': post_count,
        'total_comments': comment_count,
        'total_moderation_records': moderation_count,
        'index_entries': len(index),
        'last_run': state.get('last_run', 'never'),
        'storage_dir': storage_dir
    }


def print_stats(storage_dir: str):
    """Print collection statistics"""
    stats = get_stats(storage_dir)

    print(f"\n{'='*60}")
    print("Collection Statistics")
    print(f"{'='*60}")
    print(f"Total posts: {stats['total_posts']}")
    print(f"Total comments: {stats['total_comments']}")
    print(f"Total moderation records: {stats['total_moderation_records']}")
    print(f"Index entries: {stats['index_entries']}")
    print(f"Last run: {stats['last_run']}")
    print(f"Storage: {stats['storage_dir']}")
    print(f"{'='*60}\n")


# ===== MAIN ENTRY POINT =====

def load_platform_config(config_file: str = "./platform_config.json") -> Dict:
    """Load platform configuration from JSON file"""
    try:
        with open(config_file, 'r') as f:
            return json.load(f)
    except Exception as e:
        print(f"Error loading platform config: {e}")
        # Return minimal fallback config
        return {
            "collection_targets": [
                {"platform": "reddit", "community": "python", "max_posts": 50, "priority": "high"},
                {"platform": "reddit", "community": "programming", "max_posts": 50, "priority": "high"},
                {"platform": "hackernews", "community": "front_page", "max_posts": 50, "priority": "high"},
            ]
        }
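
# For reference, platform_config.json is expected to mirror the fallback above;
# only the "collection_targets" list is consumed (by get_collection_sources below):
#
#   {
#     "collection_targets": [
#       {"platform": "reddit", "community": "python", "max_posts": 50, "priority": "high"}
#     ]
#   }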


def get_collection_sources(config: Dict, priority_filter: Optional[str] = None) -> List[Dict]:
    """Extract collection sources from platform config, optionally filtered by priority"""
    sources = []

    for target in config.get('collection_targets', []):
        # Apply priority filter if specified
        if priority_filter and target.get('priority') != priority_filter:
            continue

        # Use the same defaults as collect_batch so a sparse config entry
        # doesn't raise a KeyError
        sources.append({
            'platform': target['platform'],
            'community': target.get('community', ''),
            'max_posts': target.get('max_posts', 100)
        })

    return sources


def main():
    """Main entry point"""
    storage_dir = "./data"

    # Load platform configuration
    platform_config = load_platform_config()

    # Get collection sources (all priorities for comprehensive collection)
    sources = get_collection_sources(platform_config)

    print(f"Loaded {len(sources)} collection targets from platform configuration")
    for source in sources:
        print(f"  - {source['platform']}/{source['community']}: {source['max_posts']} posts")

    # Collect posts and comments
    collect_batch(sources, storage_dir, days_back=1, fetch_comments=True)

    # Print statistics
    print_stats(storage_dir)


if __name__ == "__main__":
    main()