Compare commits

6 Commits
main ... lnwc

Author SHA1 Message Date
chelsea
9e5f27316e worked on hackernews schema converter 2025-10-12 21:16:15 -05:00
chelsea
1a6ad08079 fix(data): Correct post source field before saving to fix community filtering 2025-10-12 20:53:15 -05:00
chelsea
1a999ab00b additional debugging to find user_communinties value 2025-10-12 20:38:26 -05:00
chelsea
72b453d6dd additional debugging added to api_post() 2025-10-12 20:19:03 -05:00
chelsea
ea24102053 refactored api_posts() in app.py and added some debugging to trace issue 28 2025-10-12 19:42:01 -05:00
fecafc15ee fixed file permission error causing reboot loop
d
2025-10-12 23:40:33,603 - apscheduler.scheduler - INFO - Scheduler started
2025-10-12 23:40:33,605 - polling_service - INFO - Polling scheduler started
2025-10-12 23:40:33,605 - apscheduler.scheduler - INFO - Added job "Check and poll sources" to job store "default"
2025-10-12 23:40:33,606 - polling_service - INFO - Poll checker job scheduled
2025-10-12 23:40:33,610 - filter_pipeline.config - INFO - Loaded filter config from filter_config.json
2025-10-12 23:40:33,610 - filter_pipeline.config - INFO - Loaded 5 filtersets from filtersets.json
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.12/site-packages/flask/__main__.py", line 3, in <module>
    main()
  File "/usr/local/lib/python3.12/site-packages/flask/cli.py", line 1131, in main
    cli.main()
  File "/usr/local/lib/python3.12/site-packages/click/core.py", line 1383, in main
    rv = self.invoke(ctx)
         ^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/click/core.py", line 1850, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/click/core.py", line 1246, in invoke
    return ctx.invoke(self.callback, **ctx.params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/click/core.py", line 814, in invoke
    return callback(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/click/decorators.py", line 93, in new_func
    return ctx.invoke(f, obj, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/click/core.py", line 814, in invoke
    return callback(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/flask/cli.py", line 979, in run_command
    raise e from None
  File "/usr/local/lib/python3.12/site-packages/flask/cli.py", line 963, in run_command
    app: WSGIApplication = info.load_app()  # pyright: ignore
                           ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/flask/cli.py", line 349, in load_app
    app = locate_app(import_name, name)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/site-packages/flask/cli.py", line 245, in locate_app
    __import__(module_name)
  File "/app/app.py", line 91, in <module>
✓ Database tables created
    filter_engine = FilterEngine.get_instance()
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/app/filter_pipeline/engine.py", line 55, in get_instance
    cls._instance = cls()
                    ^^^^^
  File "/app/filter_pipeline/engine.py", line 43, in __init__
    self.cache = FilterCache(self.config.get_cache_dir())
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/app/filter_pipeline/cache.py", line 28, in __init__
    self.cache_dir.mkdir(parents=True, exist_ok=True)
  File "/usr/local/lib/python3.12/pathlib.py", line 1311, in mkdir
    os.mkdir(self, mode)
PermissionError: [Errno 13] Permission denied: 'data/filter_cach
2025-10-12 23:43:42 +00:00
4 changed files with 62 additions and 36 deletions

72
app.py
View File

@@ -432,55 +432,67 @@ def api_posts():
cutoff_date = datetime.utcnow() - timedelta(days=time_filter_days)
time_cutoff = cutoff_date.timestamp()
# Collect raw posts for filtering
raw_posts = []
for post_uuid, post_data in cached_posts.items():
# Apply time filter first if enabled
# ====================================================================
# START OF REFACTORED SECTION
# ====================================================================
def _post_should_be_included(post_data):
"""Check if a post passes all pre-filterset criteria."""
# Apply time filter
if time_filter_enabled and time_cutoff:
post_timestamp = post_data.get('timestamp', 0)
if post_timestamp < time_cutoff:
continue
# Apply community filter (before filterset)
if post_data.get('timestamp', 0) < time_cutoff:
return False
# Apply community filter
if community and post_data.get('source', '').lower() != community.lower():
continue
return False
# Apply platform filter (before filterset)
# Apply platform filter
if platform and post_data.get('platform', '').lower() != platform.lower():
continue
return False
# Apply user's community preferences (before filterset)
# Apply user's community preferences
if user_communities:
post_source = post_data.get('source', '').lower()
post_platform = post_data.get('platform', '').lower()
# Check if this post matches any of the user's selected communities
matches_community = False
for selected_community in user_communities:
selected_community = selected_community.lower()
# Match by exact source name or platform name
if (post_source == selected_community or
post_platform == selected_community or
selected_community in post_source):
matches_community = True
break
if not matches_community:
continue
if not any(
post_source == c or post_platform == c or c in post_source
for c in user_communities
):
# ====================================================================
# MODIFICATION: Add logging here
# ====================================================================
logger.error(
f"Post filtered out for user {current_user.id if current_user.is_authenticated else 'anonymous'}: "
f"Community mismatch. Platform='{post_platform}', Source='{post_source}', "
f"User Communities={user_communities}"
)
# ====================================================================
return False
# Apply search filter (before filterset)
# Apply search filter
if search_query:
title = post_data.get('title', '').lower()
content = post_data.get('content', '').lower()
author = post_data.get('author', '').lower()
source = post_data.get('source', '').lower()
if not (search_query in title or
search_query in content or
search_query in author or
search_query in source):
continue
return False
return True
raw_posts.append(post_data)
# Collect raw posts using a clean, declarative list comprehension
raw_posts = [
post_data for post_data in cached_posts.values()
if _post_should_be_included(post_data)
]
# ====================================================================
# END OF REFACTORED SECTION
# ====================================================================
# Apply filterset using FilterEngine
filtered_posts = filter_engine.apply_filterset(raw_posts, filterset_name, use_cache=True)

View File

@@ -211,6 +211,12 @@ def collect_platform(platform: str, community: str, start_date: str, end_date: s
if post_id in index:
continue
# ====================================================================
# FIX: Correct the post's source field BEFORE saving
# ====================================================================
post['source'] = community if community else platform
# ====================================================================
# Save post
post_uuid = save_post(post, platform, index, dirs)
added_count += 1

View File

@@ -292,8 +292,10 @@ class data_methods():
'meta': {'is_self': post.get('is_self', False)}
}
# In data_methods.converters.hackernews_to_schema()
@staticmethod
def hackernews_to_schema(raw):
def hackernews_to_schema(raw, community='front_page'): # Add community parameter
if not raw or raw.get('type') != 'story':
return None
return {
@@ -306,7 +308,11 @@ class data_methods():
'replies': raw.get('descendants', 0),
'url': raw.get('url', f"https://news.ycombinator.com/item?id={raw.get('id')}"),
'content': raw.get('text', ''),
'source': 'hackernews',
# ====================================================================
# FIX: Use the community parameter for the source
# ====================================================================
'source': community,
# ====================================================================
'tags': ['hackernews'],
'meta': {}
}
@@ -681,7 +687,7 @@ class data_methods():
stories.append(data_methods.utils.http_get_json(story_url))
# Convert and filter
posts = [data_methods.converters.hackernews_to_schema(s) for s in stories]
posts = [data_methods.converters.hackernews_to_schema(s, community) for s in stories]
return data_methods.utils.filter_by_date_range(posts, start_date, end_date)
@staticmethod

View File

@@ -48,8 +48,9 @@ services:
AUTH0_CLIENT_SECRET: ${AUTH0_CLIENT_SECRET:-}
AUTH0_AUDIENCE: ${AUTH0_AUDIENCE:-}
volumes:
# Persistent data storage
- ./data:/app/data
# Application-managed data (using a named volume)
- app_data:/app/data
# User-editable content (using bind mounts)
- ./static:/app/static
- ./backups:/app/backups
- ./active_html:/app/active_html
@@ -71,4 +72,5 @@ networks:
driver: bridge
volumes:
postgres_data:
postgres_data:
app_data: # <-- New named volume declared here