Compare commits

6 Commits
main ... lnwc

Author SHA1 Message Date
chelsea
9e5f27316e worked on hackernews schema converter 2025-10-12 21:16:15 -05:00
chelsea
1a6ad08079 fix(data): Correct post source field before saving to fix community filtering 2025-10-12 20:53:15 -05:00
chelsea
1a999ab00b additional debugging to find user_communities value 2025-10-12 20:38:26 -05:00
chelsea
72b453d6dd additional debugging added to api_posts() 2025-10-12 20:19:03 -05:00
chelsea
ea24102053 refactored api_posts() in app.py and added some debugging to trace issue 28 2025-10-12 19:42:01 -05:00
fecafc15ee fixed file permission error causing reboot loop
d
2025-10-12 23:40:33,603 - apscheduler.scheduler - INFO - Scheduler started
2025-10-12 23:40:33,605 - polling_service - INFO - Polling scheduler started
2025-10-12 23:40:33,605 - apscheduler.scheduler - INFO - Added job "Check and poll sources" to job store "default"
2025-10-12 23:40:33,606 - polling_service - INFO - Poll checker job scheduled
2025-10-12 23:40:33,610 - filter_pipeline.config - INFO - Loaded filter config from filter_config.json
2025-10-12 23:40:33,610 - filter_pipeline.config - INFO - Loaded 5 filtersets from filtersets.json
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.12/site-packages/flask/__main__.py", line 3, in <module>
    main()
  File "/usr/local/lib/python3.12/site-packages/flask/cli.py", line 1131, in main
    cli.main()
  File "/usr/local/lib/python3.12/site-packages/click/core.py", line 1383, in main
    rv = self.invoke(ctx)
         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/click/core.py", line 1850, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/click/core.py", line 1246, in invoke
    return ctx.invoke(self.callback, **ctx.params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/click/core.py", line 814, in invoke
    return callback(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/click/decorators.py", line 93, in new_func
    return ctx.invoke(f, obj, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/click/core.py", line 814, in invoke
    return callback(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/flask/cli.py", line 979, in run_command
    raise e from None
  File "/usr/local/lib/python3.12/site-packages/flask/cli.py", line 963, in run_command
    app: WSGIApplication = info.load_app()  # pyright: ignore
                           ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/flask/cli.py", line 349, in load_app
    app = locate_app(import_name, name)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/flask/cli.py", line 245, in locate_app
    __import__(module_name)
  File "/app/app.py", line 91, in <module>
✓ Database tables created
    filter_engine = FilterEngine.get_instance()
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/app/filter_pipeline/engine.py", line 55, in get_instance
    cls._instance = cls()
                    ^^^^^
  File "/app/filter_pipeline/engine.py", line 43, in __init__
    self.cache = FilterCache(self.config.get_cache_dir())
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/app/filter_pipeline/cache.py", line 28, in __init__
    self.cache_dir.mkdir(parents=True, exist_ok=True)
  File "/usr/local/lib/python3.12/pathlib.py", line 1311, in mkdir
    os.mkdir(self, mode)
PermissionError: [Errno 13] Permission denied: 'data/filter_cach
2025-10-12 23:43:42 +00:00
4 changed files with 62 additions and 36 deletions

72
app.py
View File

@@ -432,55 +432,67 @@ def api_posts():
cutoff_date = datetime.utcnow() - timedelta(days=time_filter_days) cutoff_date = datetime.utcnow() - timedelta(days=time_filter_days)
time_cutoff = cutoff_date.timestamp() time_cutoff = cutoff_date.timestamp()
# Collect raw posts for filtering # ====================================================================
raw_posts = [] # START OF REFACTORED SECTION
for post_uuid, post_data in cached_posts.items(): # ====================================================================
# Apply time filter first if enabled
def _post_should_be_included(post_data):
"""Check if a post passes all pre-filterset criteria."""
# Apply time filter
if time_filter_enabled and time_cutoff: if time_filter_enabled and time_cutoff:
post_timestamp = post_data.get('timestamp', 0) if post_data.get('timestamp', 0) < time_cutoff:
if post_timestamp < time_cutoff: return False
continue
# Apply community filter (before filterset) # Apply community filter
if community and post_data.get('source', '').lower() != community.lower(): if community and post_data.get('source', '').lower() != community.lower():
continue return False
# Apply platform filter (before filterset) # Apply platform filter
if platform and post_data.get('platform', '').lower() != platform.lower(): if platform and post_data.get('platform', '').lower() != platform.lower():
continue return False
# Apply user's community preferences (before filterset) # Apply user's community preferences
if user_communities: if user_communities:
post_source = post_data.get('source', '').lower() post_source = post_data.get('source', '').lower()
post_platform = post_data.get('platform', '').lower() post_platform = post_data.get('platform', '').lower()
if not any(
# Check if this post matches any of the user's selected communities post_source == c or post_platform == c or c in post_source
matches_community = False for c in user_communities
for selected_community in user_communities: ):
selected_community = selected_community.lower() # ====================================================================
# Match by exact source name or platform name # MODIFICATION: Add logging here
if (post_source == selected_community or # ====================================================================
post_platform == selected_community or logger.error(
selected_community in post_source): f"Post filtered out for user {current_user.id if current_user.is_authenticated else 'anonymous'}: "
matches_community = True f"Community mismatch. Platform='{post_platform}', Source='{post_source}', "
break f"User Communities={user_communities}"
)
if not matches_community: # ====================================================================
continue return False
# Apply search filter (before filterset) # Apply search filter
if search_query: if search_query:
title = post_data.get('title', '').lower() title = post_data.get('title', '').lower()
content = post_data.get('content', '').lower() content = post_data.get('content', '').lower()
author = post_data.get('author', '').lower() author = post_data.get('author', '').lower()
source = post_data.get('source', '').lower() source = post_data.get('source', '').lower()
if not (search_query in title or if not (search_query in title or
search_query in content or search_query in content or
search_query in author or search_query in author or
search_query in source): search_query in source):
continue return False
return True
raw_posts.append(post_data) # Collect raw posts using a clean, declarative list comprehension
raw_posts = [
post_data for post_data in cached_posts.values()
if _post_should_be_included(post_data)
]
# ====================================================================
# END OF REFACTORED SECTION
# ====================================================================
# Apply filterset using FilterEngine # Apply filterset using FilterEngine
filtered_posts = filter_engine.apply_filterset(raw_posts, filterset_name, use_cache=True) filtered_posts = filter_engine.apply_filterset(raw_posts, filterset_name, use_cache=True)

View File

@@ -211,6 +211,12 @@ def collect_platform(platform: str, community: str, start_date: str, end_date: s
if post_id in index: if post_id in index:
continue continue
# ====================================================================
# FIX: Correct the post's source field BEFORE saving
# ====================================================================
post['source'] = community if community else platform
# ====================================================================
# Save post # Save post
post_uuid = save_post(post, platform, index, dirs) post_uuid = save_post(post, platform, index, dirs)
added_count += 1 added_count += 1

View File

@@ -292,8 +292,10 @@ class data_methods():
'meta': {'is_self': post.get('is_self', False)} 'meta': {'is_self': post.get('is_self', False)}
} }
# In data_methods.converters.hackernews_to_schema()
@staticmethod @staticmethod
def hackernews_to_schema(raw): def hackernews_to_schema(raw, community='front_page'): # Add community parameter
if not raw or raw.get('type') != 'story': if not raw or raw.get('type') != 'story':
return None return None
return { return {
@@ -306,7 +308,11 @@ class data_methods():
'replies': raw.get('descendants', 0), 'replies': raw.get('descendants', 0),
'url': raw.get('url', f"https://news.ycombinator.com/item?id={raw.get('id')}"), 'url': raw.get('url', f"https://news.ycombinator.com/item?id={raw.get('id')}"),
'content': raw.get('text', ''), 'content': raw.get('text', ''),
'source': 'hackernews', # ====================================================================
# FIX: Use the community parameter for the source
# ====================================================================
'source': community,
# ====================================================================
'tags': ['hackernews'], 'tags': ['hackernews'],
'meta': {} 'meta': {}
} }
@@ -681,7 +687,7 @@ class data_methods():
stories.append(data_methods.utils.http_get_json(story_url)) stories.append(data_methods.utils.http_get_json(story_url))
# Convert and filter # Convert and filter
posts = [data_methods.converters.hackernews_to_schema(s) for s in stories] posts = [data_methods.converters.hackernews_to_schema(s, community) for s in stories]
return data_methods.utils.filter_by_date_range(posts, start_date, end_date) return data_methods.utils.filter_by_date_range(posts, start_date, end_date)
@staticmethod @staticmethod

View File

@@ -48,8 +48,9 @@ services:
AUTH0_CLIENT_SECRET: ${AUTH0_CLIENT_SECRET:-} AUTH0_CLIENT_SECRET: ${AUTH0_CLIENT_SECRET:-}
AUTH0_AUDIENCE: ${AUTH0_AUDIENCE:-} AUTH0_AUDIENCE: ${AUTH0_AUDIENCE:-}
volumes: volumes:
# Persistent data storage # Application-managed data (using a named volume)
- ./data:/app/data - app_data:/app/data
# User-editable content (using bind mounts)
- ./static:/app/static - ./static:/app/static
- ./backups:/app/backups - ./backups:/app/backups
- ./active_html:/app/active_html - ./active_html:/app/active_html
@@ -71,4 +72,5 @@ networks:
driver: bridge driver: bridge
volumes: volumes:
postgres_data: postgres_data:
app_data: # <-- New named volume declared here