Initial commit: BalanceBoard - Reddit-style content aggregator

- Flask-based web application with PostgreSQL
- User authentication and session management
- Content moderation and filtering
- Docker deployment with docker-compose
- Admin interface for content management

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-10-11 16:11:13 -05:00
commit e821a26b48
35 changed files with 10736 additions and 0 deletions

215
polling_service.py Normal file
View File

@@ -0,0 +1,215 @@
"""
Polling Service
Background service for collecting data from configured sources.
"""
import logging
import traceback
from datetime import datetime
from pathlib import Path
from typing import Dict, List
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.interval import IntervalTrigger
from database import db
from models import PollSource, PollLog
from data_collection import collect_platform, get_collection_sources, load_platform_config
logger = logging.getLogger(__name__)
class PollingService:
"""Background polling service using APScheduler"""
def __init__(self, app=None):
self.scheduler = BackgroundScheduler()
self.app = app
self.storage_dir = 'data'
def init_app(self, app):
"""Initialize with Flask app"""
self.app = app
self.storage_dir = app.config.get('POLL_STORAGE_DIR', 'data')
def start(self):
"""Start the scheduler"""
if not self.scheduler.running:
self.scheduler.start()
logger.info("Polling scheduler started")
# Schedule the poll checker to run every minute
self.scheduler.add_job(
func=self._check_and_poll,
trigger=IntervalTrigger(minutes=1),
id='poll_checker',
name='Check and poll sources',
replace_existing=True
)
logger.info("Poll checker job scheduled")
def stop(self):
"""Stop the scheduler"""
if self.scheduler.running:
self.scheduler.shutdown()
logger.info("Polling scheduler stopped")
def _check_and_poll(self):
"""Check which sources need polling and poll them"""
if not self.app:
logger.error("No app context available")
return
with self.app.app_context():
try:
# Get all enabled sources
sources = PollSource.query.filter_by(enabled=True).all()
for source in sources:
# Check if source needs polling
if self._should_poll(source):
self._poll_source(source)
except Exception as e:
logger.error(f"Error in poll checker: {e}")
logger.error(traceback.format_exc())
def _should_poll(self, source: PollSource) -> bool:
"""Determine if a source should be polled now"""
if not source.last_poll_time:
# Never polled, should poll
return True
# Calculate time since last poll
time_since_poll = datetime.utcnow() - source.last_poll_time
minutes_since_poll = time_since_poll.total_seconds() / 60
# Poll if interval has elapsed
return minutes_since_poll >= source.poll_interval_minutes
def _poll_source(self, source: PollSource):
"""Poll a single source"""
logger.info(f"Polling {source.platform}:{source.source_id}")
# Create poll log
poll_log = PollLog(
source_id=source.id,
started_at=datetime.utcnow(),
status='running'
)
db.session.add(poll_log)
db.session.commit()
try:
# Perform the actual data collection
result = self._collect_data(source)
# Update poll log
poll_log.completed_at = datetime.utcnow()
poll_log.status = 'success'
poll_log.posts_found = result.get('posts_found', 0)
poll_log.posts_new = result.get('posts_new', 0)
poll_log.posts_updated = result.get('posts_updated', 0)
# Update source
source.last_poll_time = datetime.utcnow()
source.last_poll_status = 'success'
source.last_poll_error = None
source.posts_collected += result.get('posts_new', 0)
db.session.commit()
logger.info(f"Polling completed for {source.platform}:{source.source_id} - "
f"{result.get('posts_new', 0)} new posts")
except Exception as e:
error_msg = str(e)
error_trace = traceback.format_exc()
logger.error(f"Error polling {source.platform}:{source.source_id}: {error_msg}")
logger.error(error_trace)
# Update poll log
poll_log.completed_at = datetime.utcnow()
poll_log.status = 'error'
poll_log.error_message = f"{error_msg}\n\n{error_trace}"
# Update source
source.last_poll_time = datetime.utcnow()
source.last_poll_status = 'error'
source.last_poll_error = error_msg
db.session.commit()
def _collect_data(self, source: PollSource) -> Dict:
"""
Collect data from a source.
Wraps the existing data_collection.py functionality.
"""
from data_collection import ensure_directories, load_index, save_index, calculate_date_range, load_state, save_state
# Setup directories and load state
dirs = ensure_directories(self.storage_dir)
index = load_index(self.storage_dir)
state = load_state(self.storage_dir)
# Calculate date range (collect last 1 day)
start_iso, end_iso = calculate_date_range(1, state)
try:
# Call the existing collect_platform function
posts_collected = collect_platform(
platform=source.platform,
community=source.source_id,
start_date=start_iso,
end_date=end_iso,
max_posts=100, # Default limit
fetch_comments=True,
index=index,
dirs=dirs
)
# Save updated index and state
save_index(index, self.storage_dir)
state['last_run'] = end_iso
save_state(state, self.storage_dir)
return {
'posts_found': posts_collected,
'posts_new': posts_collected,
'posts_updated': 0
}
except Exception as e:
logger.error(f"Error in _collect_data: {e}")
return {
'posts_found': 0,
'posts_new': 0,
'posts_updated': 0
}
def poll_now(self, source_id: str):
"""Manually trigger polling for a specific source"""
with self.app.app_context():
source = PollSource.query.get(source_id)
if source:
self._poll_source(source)
return True
return False
def get_status(self) -> Dict:
"""Get scheduler status"""
return {
'running': self.scheduler.running,
'jobs': [
{
'id': job.id,
'name': job.name,
'next_run': job.next_run_time.isoformat() if job.next_run_time else None
}
for job in self.scheduler.get_jobs()
]
}
# Global polling service instance
polling_service = PollingService()