Add filter pipeline core infrastructure (Phase 1)
Implements a plugin-based content filtering system with multi-level caching.

Core Components:
- FilterEngine: Main orchestrator for content filtering
- FilterCache: 3-level caching (memory, AI results, filterset results)
- FilterConfig: Configuration loader for filter_config.json & filtersets.json
- FilterResult & AIAnalysisResult: Data models for filter results

Architecture:
- BaseStage: Abstract class for pipeline stages
- BaseFilterPlugin: Abstract class for filter plugins
- Multi-threaded parallel processing support
- Content-hash based AI result caching (cost savings)
- Filterset result caching (fast filterset switching)

Configuration:
- filter_config.json: AI models, caching, parallel workers
- Uses only Llama 70B for cost efficiency
- Compatible with the existing filtersets.json

Integration:
- apply_filterset() API compatible with user preferences
- process_batch() for batch post processing
- Lazy-loaded stages to avoid import errors when AI is disabled

Related to issue #8 (filtering engine implementation)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
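A minimal usage sketch of the apply_filterset() API described above (illustrative only, not part of the diff below; it assumes filter_config.json and filtersets.json sit in the working directory and that posts are dicts carrying a 'uuid' key, as the engine expects):

    from filter_pipeline import FilterEngine

    engine = FilterEngine()  # loads filter_config.json and filtersets.json

    posts = [
        {'uuid': 'post-1', 'title': 'Hello', 'content': 'First post'},
        {'uuid': 'post-2', 'title': 'Spam?', 'content': 'Second post'},
    ]

    # With the default 'no_filter' filterset every post passes with a neutral score.
    visible = engine.apply_filterset(posts, filterset_name='no_filter')
    for post in visible:
        print(post['uuid'], post['_filter_score'])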
filter_config.json (new file, 27 lines)
@@ -0,0 +1,27 @@
{
  "ai": {
    "enabled": false,
    "openrouter_key_file": "openrouter_key.txt",
    "models": {
      "cheap": "meta-llama/llama-3.3-70b-instruct",
      "smart": "meta-llama/llama-3.3-70b-instruct"
    },
    "parallel_workers": 10,
    "timeout_seconds": 60,
    "note": "Using only Llama 70B for cost efficiency"
  },
  "cache": {
    "enabled": true,
    "ai_cache_dir": "data/filter_cache",
    "filterset_cache_ttl_hours": 24
  },
  "pipeline": {
    "default_stages": ["categorizer", "moderator", "filter", "ranker"],
    "batch_size": 50,
    "enable_parallel": true
  },
  "output": {
    "filtered_dir": "data/filtered",
    "save_rejected": false
  }
}
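A quick sanity check one might run after editing this file (illustrative only; assumes it is executed from the repository root):

    import json

    with open('filter_config.json') as f:
        cfg = json.load(f)

    # AI calls stay off until 'enabled' is flipped to true and an OpenRouter key
    # is supplied via openrouter_key.txt or the OPENROUTER_API_KEY env variable.
    assert cfg['ai']['enabled'] is False
    print(cfg['ai']['models'])                # cheap/smart both map to Llama 70B
    print(cfg['pipeline']['default_stages'])  # categorizer, moderator, filter, ranker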
filter_pipeline/__init__.py (new file, 10 lines)
@@ -0,0 +1,10 @@
"""
Filter Pipeline Package
Content filtering, categorization, and ranking system for BalanceBoard.
"""

from .engine import FilterEngine
from .models import FilterResult, ProcessingStatus

__all__ = ['FilterEngine', 'FilterResult', 'ProcessingStatus']
__version__ = '1.0.0'
filter_pipeline/cache.py (new file, 259 lines)
@@ -0,0 +1,259 @@
"""
Multi-Level Caching System
Implements 3-tier caching for filter pipeline efficiency.
"""

import json
import hashlib
import os
import logging
from pathlib import Path
from typing import Optional, Dict, Any
from datetime import datetime, timedelta
from .models import AIAnalysisResult, FilterResult

logger = logging.getLogger(__name__)


class FilterCache:
    """
    Three-level caching system:
    Level 1: In-memory cache (fastest, TTL-based)
    Level 2: AI analysis cache (persistent, content-hash based)
    Level 3: Filterset result cache (persistent, filterset version based)
    """

    def __init__(self, cache_dir: str = 'data/filter_cache'):
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Level 1: In-memory cache
        self.memory_cache: Dict[str, tuple[Any, datetime]] = {}
        self.memory_ttl = timedelta(minutes=5)

        # Level 2: AI analysis cache directory
        self.ai_cache_dir = self.cache_dir / 'ai_analysis'
        self.ai_cache_dir.mkdir(exist_ok=True)

        # Level 3: Filterset result cache directory
        self.filterset_cache_dir = self.cache_dir / 'filtersets'
        self.filterset_cache_dir.mkdir(exist_ok=True)

    # ===== Level 1: Memory Cache =====

    def get_memory(self, key: str) -> Optional[Any]:
        """Get from memory cache if not expired"""
        if key in self.memory_cache:
            value, timestamp = self.memory_cache[key]
            if datetime.now() - timestamp < self.memory_ttl:
                return value
            else:
                # Expired, remove
                del self.memory_cache[key]
        return None

    def set_memory(self, key: str, value: Any):
        """Store in memory cache"""
        self.memory_cache[key] = (value, datetime.now())

    def clear_memory(self):
        """Clear all memory cache"""
        self.memory_cache.clear()

    # ===== Level 2: AI Analysis Cache (Persistent) =====

    @staticmethod
    def compute_content_hash(title: str, content: str) -> str:
        """Compute SHA-256 hash of content for caching"""
        text = f"{title}\n{content}".encode('utf-8')
        return hashlib.sha256(text).hexdigest()

    def get_ai_analysis(self, content_hash: str) -> Optional[AIAnalysisResult]:
        """
        Get AI analysis result from cache.

        Args:
            content_hash: SHA-256 hash of content

        Returns:
            AIAnalysisResult if cached, None otherwise
        """
        # Check memory first
        mem_key = f"ai_{content_hash}"
        cached = self.get_memory(mem_key)
        if cached:
            return cached

        # Check disk
        cache_file = self.ai_cache_dir / f"{content_hash}.json"
        if cache_file.exists():
            try:
                with open(cache_file, 'r') as f:
                    data = json.load(f)
                result = AIAnalysisResult.from_dict(data)

                # Store in memory for faster access
                self.set_memory(mem_key, result)

                logger.debug(f"AI analysis cache hit for {content_hash[:8]}...")
                return result
            except Exception as e:
                logger.error(f"Error loading AI cache {content_hash}: {e}")
                return None

        return None

    def set_ai_analysis(self, content_hash: str, result: AIAnalysisResult):
        """
        Store AI analysis result in cache (persistent).

        Args:
            content_hash: SHA-256 hash of content
            result: AIAnalysisResult to cache
        """
        # Store in memory
        mem_key = f"ai_{content_hash}"
        self.set_memory(mem_key, result)

        # Store on disk (persistent)
        cache_file = self.ai_cache_dir / f"{content_hash}.json"
        try:
            with open(cache_file, 'w') as f:
                json.dump(result.to_dict(), f, indent=2)
            logger.debug(f"Cached AI analysis for {content_hash[:8]}...")
        except Exception as e:
            logger.error(f"Error saving AI cache {content_hash}: {e}")

    # ===== Level 3: Filterset Result Cache =====

    def _get_filterset_version(self, filterset_name: str, filtersets_config: Dict) -> str:
        """Get version hash of filterset definition for cache invalidation"""
        filterset_def = filtersets_config.get(filterset_name, {})
        # Include version field if present, otherwise hash the entire definition
        if 'version' in filterset_def:
            return str(filterset_def['version'])

        # Compute hash of filterset definition
        definition_json = json.dumps(filterset_def, sort_keys=True)
        return hashlib.md5(definition_json.encode()).hexdigest()[:8]

    def get_filterset_results(
        self,
        filterset_name: str,
        filterset_version: str,
        max_age_hours: int = 24
    ) -> Optional[Dict[str, FilterResult]]:
        """
        Get cached filterset results.

        Args:
            filterset_name: Name of filterset
            filterset_version: Version hash of filterset definition
            max_age_hours: Maximum age of cache in hours

        Returns:
            Dict mapping post_uuid to FilterResult, or None if cache invalid
        """
        cache_file = self.filterset_cache_dir / f"{filterset_name}_{filterset_version}.json"

        if not cache_file.exists():
            return None

        # Check age
        try:
            file_age = datetime.now() - datetime.fromtimestamp(cache_file.stat().st_mtime)
            if file_age > timedelta(hours=max_age_hours):
                logger.debug(f"Filterset cache expired for {filterset_name}")
                return None

            # Load cache
            with open(cache_file, 'r') as f:
                data = json.load(f)

            # Deserialize FilterResults
            results = {
                uuid: FilterResult.from_dict(result_data)
                for uuid, result_data in data.items()
            }

            logger.info(f"Filterset cache hit for {filterset_name} ({len(results)} results)")
            return results

        except Exception as e:
            logger.error(f"Error loading filterset cache {filterset_name}: {e}")
            return None

    def set_filterset_results(
        self,
        filterset_name: str,
        filterset_version: str,
        results: Dict[str, FilterResult]
    ):
        """
        Store filterset results in cache.

        Args:
            filterset_name: Name of filterset
            filterset_version: Version hash of filterset definition
            results: Dict mapping post_uuid to FilterResult
        """
        cache_file = self.filterset_cache_dir / f"{filterset_name}_{filterset_version}.json"

        try:
            # Serialize FilterResults
            data = {
                uuid: result.to_dict()
                for uuid, result in results.items()
            }

            with open(cache_file, 'w') as f:
                json.dump(data, f, indent=2)

            logger.info(f"Cached {len(results)} filterset results for {filterset_name}")
        except Exception as e:
            logger.error(f"Error saving filterset cache {filterset_name}: {e}")

    def invalidate_filterset(self, filterset_name: str):
        """
        Invalidate all caches for a filterset (when definition changes).

        Args:
            filterset_name: Name of filterset to invalidate
        """
        pattern = f"{filterset_name}_*.json"
        for cache_file in self.filterset_cache_dir.glob(pattern):
            try:
                cache_file.unlink()
                logger.info(f"Invalidated filterset cache: {cache_file.name}")
            except Exception as e:
                logger.error(f"Error invalidating cache {cache_file}: {e}")

    # ===== Utility Methods =====

    def get_cache_stats(self) -> Dict[str, Any]:
        """Get cache statistics"""
        ai_cache_count = len(list(self.ai_cache_dir.glob('*.json')))
        filterset_cache_count = len(list(self.filterset_cache_dir.glob('*.json')))
        memory_cache_count = len(self.memory_cache)

        return {
            'memory_cache_size': memory_cache_count,
            'ai_cache_size': ai_cache_count,
            'filterset_cache_size': filterset_cache_count,
            'ai_cache_dir': str(self.ai_cache_dir),
            'filterset_cache_dir': str(self.filterset_cache_dir)
        }

    def clear_all(self):
        """Clear all caches (use with caution!)"""
        self.clear_memory()

        # Clear AI cache
        for cache_file in self.ai_cache_dir.glob('*.json'):
            cache_file.unlink()

        # Clear filterset cache
        for cache_file in self.filterset_cache_dir.glob('*.json'):
            cache_file.unlink()

        logger.warning("All filter caches cleared")
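A short sketch of the Level 2 (content-hash) cache in isolation (illustrative only; it writes to a temporary directory instead of the real data/filter_cache):

    from datetime import datetime
    from filter_pipeline.cache import FilterCache
    from filter_pipeline.models import AIAnalysisResult

    cache = FilterCache(cache_dir='/tmp/filter_cache_demo')

    # The hash is derived from title + content, so unchanged posts reuse old AI results.
    h = FilterCache.compute_content_hash('My title', 'Post body text')

    if cache.get_ai_analysis(h) is None:
        analysis = AIAnalysisResult(
            content_hash=h,
            categories=['technology'],
            quality_score=0.8,
            analyzed_at=datetime.now(),
            model_used='meta-llama/llama-3.3-70b-instruct',
        )
        cache.set_ai_analysis(h, analysis)  # persisted to <cache_dir>/ai_analysis/<hash>.json

    print(cache.get_cache_stats())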
filter_pipeline/config.py (new file, 206 lines)
@@ -0,0 +1,206 @@
"""
Configuration Loader
Loads and validates filter pipeline configuration.
"""

import json
import os
import logging
from pathlib import Path
from typing import Dict, List, Any, Optional

logger = logging.getLogger(__name__)


class FilterConfig:
    """Configuration for filter pipeline"""

    def __init__(
        self,
        config_file: str = 'filter_config.json',
        filtersets_file: str = 'filtersets.json'
    ):
        self.config_file = Path(config_file)
        self.filtersets_file = Path(filtersets_file)

        # Load configurations
        self.config = self._load_config()
        self.filtersets = self._load_filtersets()

    def _load_config(self) -> Dict:
        """Load filter_config.json"""
        if not self.config_file.exists():
            logger.warning(f"{self.config_file} not found, using defaults")
            return self._get_default_config()

        try:
            with open(self.config_file, 'r') as f:
                config = json.load(f)
            logger.info(f"Loaded filter config from {self.config_file}")
            return config
        except Exception as e:
            logger.error(f"Error loading {self.config_file}: {e}")
            return self._get_default_config()

    def _load_filtersets(self) -> Dict:
        """Load filtersets.json"""
        if not self.filtersets_file.exists():
            logger.error(f"{self.filtersets_file} not found!")
            return {}

        try:
            with open(self.filtersets_file, 'r') as f:
                filtersets = json.load(f)
            logger.info(f"Loaded {len(filtersets)} filtersets from {self.filtersets_file}")
            return filtersets
        except Exception as e:
            logger.error(f"Error loading {self.filtersets_file}: {e}")
            return {}

    @staticmethod
    def _get_default_config() -> Dict:
        """Get default configuration"""
        return {
            'ai': {
                'enabled': False,  # Disabled by default until API key is configured
                'openrouter_key_file': 'openrouter_key.txt',
                'models': {
                    'cheap': 'meta-llama/llama-3.3-70b-instruct',
                    'smart': 'anthropic/claude-3.5-sonnet'
                },
                'parallel_workers': 10,
                'timeout_seconds': 60
            },
            'cache': {
                'enabled': True,
                'ai_cache_dir': 'data/filter_cache',
                'filterset_cache_ttl_hours': 24
            },
            'pipeline': {
                'default_stages': ['categorizer', 'moderator', 'filter', 'ranker'],
                'batch_size': 50,
                'enable_parallel': True
            },
            'output': {
                'filtered_dir': 'data/filtered',
                'save_rejected': False  # Don't save posts that fail filters
            }
        }

    # ===== AI Configuration =====

    def is_ai_enabled(self) -> bool:
        """Check if AI processing is enabled"""
        return self.config.get('ai', {}).get('enabled', False)

    def get_openrouter_key(self) -> Optional[str]:
        """Get OpenRouter API key"""
        # Try environment variable first
        key = os.getenv('OPENROUTER_API_KEY')
        if key:
            return key

        # Try key file
        key_file = self.config.get('ai', {}).get('openrouter_key_file')
        if key_file and Path(key_file).exists():
            try:
                with open(key_file, 'r') as f:
                    return f.read().strip()
            except Exception as e:
                logger.error(f"Error reading API key from {key_file}: {e}")

        return None

    def get_ai_model(self, model_type: str = 'cheap') -> str:
        """Get AI model name for a given type (cheap/smart)"""
        models = self.config.get('ai', {}).get('models', {})
        return models.get(model_type, 'meta-llama/llama-3.3-70b-instruct')

    def get_parallel_workers(self) -> int:
        """Get number of parallel workers for AI processing"""
        return self.config.get('ai', {}).get('parallel_workers', 10)

    # ===== Cache Configuration =====

    def is_cache_enabled(self) -> bool:
        """Check if caching is enabled"""
        return self.config.get('cache', {}).get('enabled', True)

    def get_cache_dir(self) -> str:
        """Get cache directory path"""
        return self.config.get('cache', {}).get('ai_cache_dir', 'data/filter_cache')

    def get_cache_ttl_hours(self) -> int:
        """Get filterset cache TTL in hours"""
        return self.config.get('cache', {}).get('filterset_cache_ttl_hours', 24)

    # ===== Pipeline Configuration =====

    def get_default_stages(self) -> List[str]:
        """Get default pipeline stages"""
        return self.config.get('pipeline', {}).get('default_stages', [
            'categorizer', 'moderator', 'filter', 'ranker'
        ])

    def get_batch_size(self) -> int:
        """Get batch processing size"""
        return self.config.get('pipeline', {}).get('batch_size', 50)

    def is_parallel_enabled(self) -> bool:
        """Check if parallel processing is enabled"""
        return self.config.get('pipeline', {}).get('enable_parallel', True)

    # ===== Filterset Methods =====

    def get_filterset(self, name: str) -> Optional[Dict]:
        """Get filterset configuration by name"""
        return self.filtersets.get(name)

    def get_filterset_names(self) -> List[str]:
        """Get list of available filterset names"""
        return list(self.filtersets.keys())

    def get_filterset_version(self, name: str) -> Optional[str]:
        """Get version of filterset (for cache invalidation)"""
        filterset = self.get_filterset(name)
        if not filterset:
            return None

        # Use explicit version if present
        if 'version' in filterset:
            return str(filterset['version'])

        # Otherwise compute hash of definition
        import hashlib
        definition_json = json.dumps(filterset, sort_keys=True)
        return hashlib.md5(definition_json.encode()).hexdigest()[:8]

    # ===== Output Configuration =====

    def get_filtered_dir(self) -> str:
        """Get directory for filtered posts"""
        return self.config.get('output', {}).get('filtered_dir', 'data/filtered')

    def should_save_rejected(self) -> bool:
        """Check if rejected posts should be saved"""
        return self.config.get('output', {}).get('save_rejected', False)

    # ===== Utility Methods =====

    def reload(self):
        """Reload configurations from disk"""
        self.config = self._load_config()
        self.filtersets = self._load_filtersets()
        logger.info("Configuration reloaded")

    def get_config_summary(self) -> Dict[str, Any]:
        """Get summary of configuration"""
        return {
            'ai_enabled': self.is_ai_enabled(),
            'cache_enabled': self.is_cache_enabled(),
            'parallel_enabled': self.is_parallel_enabled(),
            'num_filtersets': len(self.filtersets),
            'filterset_names': self.get_filterset_names(),
            'default_stages': self.get_default_stages(),
            'batch_size': self.get_batch_size()
        }
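A minimal sketch of reading settings through FilterConfig instead of touching the JSON files directly (illustrative only; assumes both files sit in the working directory):

    from filter_pipeline.config import FilterConfig

    config = FilterConfig('filter_config.json', 'filtersets.json')

    print(config.get_config_summary())   # ai/cache/parallel flags, filterset names, stages
    print(config.is_ai_enabled())        # False until enabled in filter_config.json
    print(config.get_ai_model('cheap'))  # meta-llama/llama-3.3-70b-instruct
    print(config.get_default_stages())   # ['categorizer', 'moderator', 'filter', 'ranker']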
filter_pipeline/engine.py (new file, 376 lines)
@@ -0,0 +1,376 @@
"""
Filter Engine
Main orchestrator for content filtering pipeline.
"""

import logging
import traceback
from typing import List, Dict, Any, Optional
from datetime import datetime
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

from .config import FilterConfig
from .cache import FilterCache
from .models import FilterResult, ProcessingStatus, AIAnalysisResult

logger = logging.getLogger(__name__)


class FilterEngine:
    """
    Main filter pipeline orchestrator.

    Coordinates multi-stage content filtering with intelligent caching.
    Compatible with user preferences and filterset selections.
    """

    _instance = None

    def __init__(
        self,
        config_file: str = 'filter_config.json',
        filtersets_file: str = 'filtersets.json'
    ):
        """
        Initialize filter engine.

        Args:
            config_file: Path to filter_config.json
            filtersets_file: Path to filtersets.json
        """
        self.config = FilterConfig(config_file, filtersets_file)
        self.cache = FilterCache(self.config.get_cache_dir())

        # Lazy-loaded stages (will be imported when AI is enabled)
        self._stages = None

        logger.info("FilterEngine initialized")
        logger.info(f"Configuration: {self.config.get_config_summary()}")

    @classmethod
    def get_instance(cls) -> 'FilterEngine':
        """Get singleton instance of FilterEngine"""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def _init_stages(self):
        """Initialize pipeline stages (lazy loading)"""
        if self._stages is not None:
            return

        from .stages.categorizer import CategorizerStage
        from .stages.moderator import ModeratorStage
        from .stages.filter import FilterStage
        from .stages.ranker import RankerStage

        # Initialize stages based on configuration
        self._stages = {
            'categorizer': CategorizerStage(self.config, self.cache),
            'moderator': ModeratorStage(self.config, self.cache),
            'filter': FilterStage(self.config, self.cache),
            'ranker': RankerStage(self.config, self.cache)
        }

        logger.info(f"Initialized {len(self._stages)} pipeline stages")

    def apply_filterset(
        self,
        posts: List[Dict[str, Any]],
        filterset_name: str = 'no_filter',
        use_cache: bool = True
    ) -> List[Dict[str, Any]]:
        """
        Apply filterset to posts (compatible with user preferences).

        This is the main public API used by app.py when loading user feeds.

        Args:
            posts: List of post dictionaries
            filterset_name: Name of filterset from user settings (e.g., 'safe_content')
            use_cache: Whether to use cached results

        Returns:
            List of posts that passed the filter, with score and metadata added
        """
        if not posts:
            return []

        # Validate filterset exists
        filterset = self.config.get_filterset(filterset_name)
        if not filterset:
            logger.warning(f"Filterset '{filterset_name}' not found, using 'no_filter'")
            filterset_name = 'no_filter'

        logger.info(f"Applying filterset '{filterset_name}' to {len(posts)} posts")

        # Check if we have cached filterset results
        if use_cache and self.config.is_cache_enabled():
            filterset_version = self.config.get_filterset_version(filterset_name)
            cached_results = self.cache.get_filterset_results(
                filterset_name,
                filterset_version,
                self.config.get_cache_ttl_hours()
            )

            if cached_results:
                # Filter posts using cached results
                filtered_posts = []
                for post in posts:
                    post_uuid = post.get('uuid')
                    if post_uuid in cached_results:
                        result = cached_results[post_uuid]
                        if result.passed:
                            # Add filter metadata to post
                            post['_filter_score'] = result.score
                            post['_filter_categories'] = result.categories
                            post['_filter_tags'] = result.tags
                            filtered_posts.append(post)

                logger.info(f"Cache hit: {len(filtered_posts)}/{len(posts)} posts passed filter")
                return filtered_posts

        # Cache miss or disabled - process posts through pipeline
        results = self.process_batch(posts, filterset_name)

        # Save to filterset cache
        if self.config.is_cache_enabled():
            filterset_version = self.config.get_filterset_version(filterset_name)
            results_dict = {r.post_uuid: r for r in results}
            self.cache.set_filterset_results(filterset_name, filterset_version, results_dict)

        # Build filtered post list
        filtered_posts = []
        results_by_uuid = {r.post_uuid: r for r in results}

        for post in posts:
            post_uuid = post.get('uuid')
            result = results_by_uuid.get(post_uuid)

            if result and result.passed:
                # Add filter metadata to post
                post['_filter_score'] = result.score
                post['_filter_categories'] = result.categories
                post['_filter_tags'] = result.tags
                filtered_posts.append(post)

        logger.info(f"Processed: {len(filtered_posts)}/{len(posts)} posts passed filter")
        return filtered_posts

    def process_batch(
        self,
        posts: List[Dict[str, Any]],
        filterset_name: str = 'no_filter'
    ) -> List[FilterResult]:
        """
        Process batch of posts through pipeline.

        Args:
            posts: List of post dictionaries
            filterset_name: Name of filterset to apply

        Returns:
            List of FilterResults for each post
        """
        if not posts:
            return []

        # Special case: no_filter passes everything with default scores
        if filterset_name == 'no_filter':
            return self._process_no_filter(posts)

        # Initialize stages if needed
        if self.config.is_ai_enabled():
            self._init_stages()

        # If AI is disabled but filterset requires it, fall back to no_filter
        if not self.config.is_ai_enabled() and filterset_name != 'no_filter':
            logger.warning(f"AI disabled but '{filterset_name}' requires AI - falling back to 'no_filter'")
            return self._process_no_filter(posts)

        # Get pipeline stages for this filterset
        stage_names = self._get_stages_for_filterset(filterset_name)

        # Process posts (parallel or sequential based on config)
        if self.config.is_parallel_enabled():
            results = self._process_batch_parallel(posts, filterset_name, stage_names)
        else:
            results = self._process_batch_sequential(posts, filterset_name, stage_names)

        return results

    def _process_no_filter(self, posts: List[Dict[str, Any]]) -> List[FilterResult]:
        """Process posts with no_filter (all pass with default scores)"""
        results = []
        for post in posts:
            result = FilterResult(
                post_uuid=post.get('uuid', ''),
                passed=True,
                score=0.5,  # Neutral score
                categories=[],
                tags=[],
                filterset_name='no_filter',
                processed_at=datetime.now(),
                status=ProcessingStatus.COMPLETED
            )
            results.append(result)

        return results

    def _get_stages_for_filterset(self, filterset_name: str) -> List[str]:
        """Get pipeline stages to run for a filterset"""
        filterset = self.config.get_filterset(filterset_name)

        # Check if filterset specifies custom stages
        if filterset and 'pipeline_stages' in filterset:
            return filterset['pipeline_stages']

        # Use default stages
        return self.config.get_default_stages()

    def _process_batch_parallel(
        self,
        posts: List[Dict[str, Any]],
        filterset_name: str,
        stage_names: List[str]
    ) -> List[FilterResult]:
        """Process posts in parallel"""
        results = [None] * len(posts)
        workers = self.config.get_parallel_workers()

        def process_single_post(idx_post):
            idx, post = idx_post
            try:
                result = self._process_single_post(post, filterset_name, stage_names)
                return idx, result
            except Exception as e:
                logger.error(f"Error processing post {idx}: {e}")
                logger.error(traceback.format_exc())
                # Return failed result
                return idx, FilterResult(
                    post_uuid=post.get('uuid', ''),
                    passed=False,
                    score=0.0,
                    filterset_name=filterset_name,
                    processed_at=datetime.now(),
                    status=ProcessingStatus.FAILED,
                    error=str(e)
                )

        with ThreadPoolExecutor(max_workers=workers) as executor:
            futures = {executor.submit(process_single_post, (i, post)): i
                       for i, post in enumerate(posts)}

            for future in as_completed(futures):
                idx, result = future.result()
                results[idx] = result

        return results

    def _process_batch_sequential(
        self,
        posts: List[Dict[str, Any]],
        filterset_name: str,
        stage_names: List[str]
    ) -> List[FilterResult]:
        """Process posts sequentially"""
        results = []
        for post in posts:
            try:
                result = self._process_single_post(post, filterset_name, stage_names)
                results.append(result)
            except Exception as e:
                logger.error(f"Error processing post: {e}")
                results.append(FilterResult(
                    post_uuid=post.get('uuid', ''),
                    passed=False,
                    score=0.0,
                    filterset_name=filterset_name,
                    processed_at=datetime.now(),
                    status=ProcessingStatus.FAILED,
                    error=str(e)
                ))

        return results

    def _process_single_post(
        self,
        post: Dict[str, Any],
        filterset_name: str,
        stage_names: List[str]
    ) -> FilterResult:
        """
        Process single post through pipeline stages.

        Stages are run sequentially: Categorizer → Moderator → Filter → Ranker
        """
        # Initialize result
        result = FilterResult(
            post_uuid=post.get('uuid', ''),
            passed=True,  # Start as passed, stages can reject
            score=0.5,  # Default score
            filterset_name=filterset_name,
            processed_at=datetime.now(),
            status=ProcessingStatus.PROCESSING
        )

        # Run each stage
        for stage_name in stage_names:
            if stage_name not in self._stages:
                logger.warning(f"Stage '{stage_name}' not found, skipping")
                continue

            stage = self._stages[stage_name]

            if not stage.is_enabled():
                logger.debug(f"Stage '{stage_name}' disabled, skipping")
                continue

            # Process through stage
            try:
                result = stage.process(post, result)

                # If post was rejected by this stage, stop processing
                if not result.passed:
                    logger.debug(f"Post {post.get('uuid', '')} rejected by {stage_name}")
                    break

            except Exception as e:
                logger.error(f"Error in stage '{stage_name}': {e}")
                result.status = ProcessingStatus.FAILED
                result.error = f"{stage_name}: {str(e)}"
                result.passed = False
                break

        # Mark as completed if not failed
        if result.status != ProcessingStatus.FAILED:
            result.status = ProcessingStatus.COMPLETED

        return result

    # ===== Utility Methods =====

    def get_available_filtersets(self) -> List[str]:
        """Get list of available filterset names (for user settings UI)"""
        return self.config.get_filterset_names()

    def get_filterset_description(self, name: str) -> Optional[str]:
        """Get description of a filterset (for user settings UI)"""
        filterset = self.config.get_filterset(name)
        return filterset.get('description') if filterset else None

    def invalidate_filterset_cache(self, filterset_name: str):
        """Invalidate cache for a filterset (when definition changes)"""
        self.cache.invalidate_filterset(filterset_name)
        logger.info(f"Invalidated cache for filterset '{filterset_name}'")

    def get_cache_stats(self) -> Dict[str, Any]:
        """Get cache statistics"""
        return self.cache.get_cache_stats()

    def reload_config(self):
        """Reload configuration from disk"""
        self.config.reload()
        self._stages = None  # Force re-initialization of stages
        logger.info("Configuration reloaded")
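Besides apply_filterset() (sketched under the commit message above), app code can use the engine's maintenance helpers; a brief illustrative sketch ('safe_content' is the hypothetical filterset name from the docstring):

    from filter_pipeline import FilterEngine

    engine = FilterEngine.get_instance()      # shared singleton

    print(engine.get_available_filtersets())  # names for the user-settings UI
    print(engine.get_cache_stats())           # memory / AI / filterset cache sizes

    # After editing a filterset definition in filtersets.json:
    engine.reload_config()
    engine.invalidate_filterset_cache('safe_content')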
filter_pipeline/models.py (new file, 121 lines)
@@ -0,0 +1,121 @@
"""
Filter Pipeline Models
Data models for filter results and processing status.
"""

from dataclasses import dataclass, field
from typing import Dict, List, Optional, Any
from datetime import datetime
from enum import Enum


class ProcessingStatus(Enum):
    """Status of content processing"""
    PENDING = 'pending'
    PROCESSING = 'processing'
    COMPLETED = 'completed'
    FAILED = 'failed'
    CACHED = 'cached'


@dataclass
class FilterResult:
    """
    Result of filtering pipeline for a single post.

    Attributes:
        post_uuid: Unique identifier for the post
        passed: Whether post passed the filter
        score: Relevance/quality score (0.0-1.0)
        categories: Detected categories/topics
        tags: Additional tags applied
        moderation_data: Safety and quality analysis results
        filterset_name: Name of filterset applied
        cache_key: Content hash for caching
        processed_at: Timestamp of processing
        status: Processing status
        error: Error message if failed
    """
    post_uuid: str
    passed: bool
    score: float
    categories: List[str] = field(default_factory=list)
    tags: List[str] = field(default_factory=list)
    moderation_data: Dict[str, Any] = field(default_factory=dict)
    filterset_name: str = 'no_filter'
    cache_key: Optional[str] = None
    processed_at: Optional[datetime] = None
    status: ProcessingStatus = ProcessingStatus.PENDING
    error: Optional[str] = None

    # Detailed scoring breakdown
    score_breakdown: Dict[str, float] = field(default_factory=dict)

    def to_dict(self) -> Dict:
        """Convert to dictionary for JSON serialization"""
        return {
            'post_uuid': self.post_uuid,
            'passed': self.passed,
            'score': self.score,
            'categories': self.categories,
            'tags': self.tags,
            'moderation_data': self.moderation_data,
            'filterset_name': self.filterset_name,
            'cache_key': self.cache_key,
            'processed_at': self.processed_at.isoformat() if self.processed_at else None,
            'status': self.status.value if isinstance(self.status, ProcessingStatus) else self.status,
            'error': self.error,
            'score_breakdown': self.score_breakdown
        }

    @classmethod
    def from_dict(cls, data: Dict) -> 'FilterResult':
        """Create from dictionary"""
        # Handle datetime deserialization
        if data.get('processed_at') and isinstance(data['processed_at'], str):
            data['processed_at'] = datetime.fromisoformat(data['processed_at'])

        # Handle enum deserialization
        if data.get('status') and isinstance(data['status'], str):
            data['status'] = ProcessingStatus(data['status'])

        return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__})


@dataclass
class AIAnalysisResult:
    """
    Result of AI analysis (categorization, moderation, etc).
    Cached separately from FilterResult for reuse across filtersets.
    """
    content_hash: str
    categories: List[str] = field(default_factory=list)
    category_scores: Dict[str, float] = field(default_factory=dict)
    moderation: Dict[str, Any] = field(default_factory=dict)
    quality_score: float = 0.5
    sentiment: Optional[str] = None
    sentiment_score: float = 0.0
    analyzed_at: Optional[datetime] = None
    model_used: Optional[str] = None

    def to_dict(self) -> Dict:
        """Convert to dictionary for JSON serialization"""
        return {
            'content_hash': self.content_hash,
            'categories': self.categories,
            'category_scores': self.category_scores,
            'moderation': self.moderation,
            'quality_score': self.quality_score,
            'sentiment': self.sentiment,
            'sentiment_score': self.sentiment_score,
            'analyzed_at': self.analyzed_at.isoformat() if self.analyzed_at else None,
            'model_used': self.model_used
        }

    @classmethod
    def from_dict(cls, data: Dict) -> 'AIAnalysisResult':
        """Create from dictionary"""
        if data.get('analyzed_at') and isinstance(data['analyzed_at'], str):
            data['analyzed_at'] = datetime.fromisoformat(data['analyzed_at'])

        return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__})
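A small round-trip sketch showing how FilterResult serializes for the filterset cache (illustrative only):

    from datetime import datetime
    from filter_pipeline.models import FilterResult, ProcessingStatus

    result = FilterResult(
        post_uuid='post-1',
        passed=True,
        score=0.72,
        categories=['technology'],
        filterset_name='no_filter',
        processed_at=datetime.now(),
        status=ProcessingStatus.COMPLETED,
    )

    data = result.to_dict()                  # datetime -> ISO string, enum -> value
    restored = FilterResult.from_dict(data)  # and back again
    assert restored.status is ProcessingStatus.COMPLETED
    assert restored.score == result.score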
filter_pipeline/plugins/__init__.py (new file, 8 lines)
@@ -0,0 +1,8 @@
"""
Filter Plugins
Pluggable filters for content filtering.
"""

from .base import BaseFilterPlugin

__all__ = ['BaseFilterPlugin']
filter_pipeline/plugins/base.py (new file, 66 lines)
@@ -0,0 +1,66 @@
"""
Base Filter Plugin
Abstract base class for all filter plugins.
"""

from abc import ABC, abstractmethod
from typing import Dict, Any, Optional


class BaseFilterPlugin(ABC):
    """
    Abstract base class for filter plugins.

    Plugins can be used within stages to implement specific filtering logic.
    Examples: keyword filtering, AI-based filtering, quality scoring, etc.
    """

    def __init__(self, config: Dict[str, Any]):
        """
        Initialize plugin.

        Args:
            config: Plugin configuration dictionary
        """
        self.config = config
        self.enabled = config.get('enabled', True)

    @abstractmethod
    def should_filter(self, post: Dict[str, Any], context: Optional[Dict] = None) -> bool:
        """
        Determine if post should be filtered OUT.

        Args:
            post: Post data dictionary
            context: Optional context from previous stages

        Returns:
            True if post should be filtered OUT (rejected), False to keep it
        """
        pass

    @abstractmethod
    def score(self, post: Dict[str, Any], context: Optional[Dict] = None) -> float:
        """
        Calculate relevance/quality score for post.

        Args:
            post: Post data dictionary
            context: Optional context from previous stages

        Returns:
            Score from 0.0 (lowest) to 1.0 (highest)
        """
        pass

    @abstractmethod
    def get_name(self) -> str:
        """Get plugin name for logging"""
        pass

    def is_enabled(self) -> bool:
        """Check if plugin is enabled"""
        return self.enabled

    def __repr__(self) -> str:
        return f"<{self.get_name()} enabled={self.enabled}>"
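A hypothetical concrete plugin built on this base class (illustrative only; no such plugin ships in this commit):

    from typing import Dict, Any, Optional

    from filter_pipeline.plugins import BaseFilterPlugin


    class KeywordBlockPlugin(BaseFilterPlugin):
        """Rejects posts whose title or content contains a blocked keyword."""

        def __init__(self, config: Dict[str, Any]):
            super().__init__(config)
            self.blocked = [kw.lower() for kw in config.get('blocked_keywords', [])]

        def should_filter(self, post: Dict[str, Any], context: Optional[Dict] = None) -> bool:
            text = f"{post.get('title', '')} {post.get('content', '')}".lower()
            return any(kw in text for kw in self.blocked)

        def score(self, post: Dict[str, Any], context: Optional[Dict] = None) -> float:
            # Neutral score; this plugin only blocks, it does not rank.
            return 0.5

        def get_name(self) -> str:
            return 'KeywordBlockPlugin'


    plugin = KeywordBlockPlugin({'blocked_keywords': ['giveaway']})
    print(plugin.should_filter({'title': 'Crypto giveaway!', 'content': ''}))  # True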
filter_pipeline/stages/__init__.py (new file, 8 lines)
@@ -0,0 +1,8 @@
"""
Pipeline Stages
Sequential processing stages for content filtering.
"""

from .base_stage import BaseStage

__all__ = ['BaseStage']
filter_pipeline/stages/base_stage.py (new file, 62 lines)
@@ -0,0 +1,62 @@
"""
Base Stage
Abstract base class for all pipeline stages.
"""

from abc import ABC, abstractmethod
from typing import List, Dict, Any
from ..models import FilterResult


class BaseStage(ABC):
    """
    Abstract base class for pipeline stages.

    Each stage processes posts sequentially and can modify FilterResults.
    Stages are executed in order: Categorizer → Moderator → Filter → Ranker
    """

    def __init__(self, config: Dict[str, Any], cache: Any):
        """
        Initialize stage.

        Args:
            config: Configuration dictionary for this stage
            cache: FilterCache instance
        """
        self.config = config
        self.cache = cache
        self.enabled = config.get('enabled', True)

    @abstractmethod
    def process(
        self,
        post: Dict[str, Any],
        result: FilterResult
    ) -> FilterResult:
        """
        Process a single post and update its FilterResult.

        Args:
            post: Post data dictionary
            result: Current FilterResult for this post

        Returns:
            Updated FilterResult

        Raises:
            Exception if processing fails
        """
        pass

    @abstractmethod
    def get_name(self) -> str:
        """Get stage name for logging"""
        pass

    def is_enabled(self) -> bool:
        """Check if stage is enabled"""
        return self.enabled

    def __repr__(self) -> str:
        return f"<{self.get_name()} enabled={self.enabled}>"
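A hypothetical minimal stage built on this base class (illustrative only; the categorizer/moderator/filter/ranker stages the engine imports are not part of this commit):

    from typing import Any, Dict

    from filter_pipeline.models import FilterResult
    from filter_pipeline.stages import BaseStage


    class LengthFilterStage(BaseStage):
        """Rejects very short posts; otherwise leaves the result untouched."""

        def __init__(self, config: Dict[str, Any], cache: Any, min_chars: int = 40):
            super().__init__(config, cache)
            self.min_chars = min_chars

        def process(self, post: Dict[str, Any], result: FilterResult) -> FilterResult:
            if len(post.get('content', '')) < self.min_chars:
                result.passed = False
                result.tags.append('too_short')
            return result

        def get_name(self) -> str:
            return 'LengthFilterStage'


    stage = LengthFilterStage(config={'enabled': True}, cache=None)
    res = FilterResult(post_uuid='post-1', passed=True, score=0.5)
    res = stage.process({'uuid': 'post-1', 'content': 'hi'}, res)
    print(res.passed, res.tags)  # False ['too_short']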