Tom Claude committed on
Commit
d49ed4a
·
1 Parent(s): b516494

Improve search: fix tag info, use ILIKE, add FTS, increase limit to 50

Browse files

- Fix incorrect tag coverage info (was "3 posts", actually 3,362 posts / 88%)
- Replace word-boundary regex with ILIKE-style case-insensitive substring matching (LOWER(column) LIKE '%keyword%') for flexible matching
(now "Russia" finds "Russian", "Russia's", etc.)
- Add FTS query guidance with ts_rank() for relevance ranking
- Remove non-existent 'dead' column references from schema
- Increase default result limit from 9 to 50 for better coverage
- Update all SQL examples to use case-insensitive LOWER(column) LIKE '%keyword%' patterns (ILIKE-style matching)

Database migrations applied:
- add_fts_to_posts: Added FTS tsvector column with GIN index
- add_trigram_indexes: Enabled pg_trgm extension for fuzzy matching

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (2) hide show
  1. src/query_intent_classifier.py +16 -14
  2. src/vanna.py +39 -26
src/query_intent_classifier.py CHANGED
@@ -165,30 +165,32 @@ class IntentClassifier:
165
  tags = classification["tags"]
166
 
167
  if intent == QueryIntent.KEYWORD:
 
168
  return f"""
169
  Search using KEYWORD approach:
170
  - Search terms: {', '.join(keywords)}
171
  - Search in: posts.title, posts.author, providers.name
172
- - Use ILIKE with wildcards for flexible matching
173
- - Do not filter by tags (most posts are not tagged yet)
 
174
  """
175
 
176
  elif intent == QueryIntent.TAG:
177
  return f"""
178
  Search using TAG approach:
179
  - Tag names: {', '.join(tags)}
180
- - Use LOWER() for case-insensitive matching
 
181
  - Join with post_tags and tags tables
182
- - Note: Only a few posts are tagged, results may be limited
183
  """
184
 
185
  else: # HYBRID
186
  return f"""
187
- Search using HYBRID approach:
188
- - Try tags first: {', '.join(tags)}
189
- - Fall back to keywords: {', '.join(keywords)}
190
  - Use OR logic: tag matches OR keyword matches in title/author
191
- - This maximizes results since most posts are not tagged yet
192
 
193
  Recommended SQL pattern:
194
  SELECT DISTINCT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type
@@ -196,13 +198,13 @@ FROM posts p
196
  LEFT JOIN post_tags pt ON p.id = pt.post_id
197
  LEFT JOIN tags t ON pt.tag_id = t.id
198
  LEFT JOIN providers pr ON p.provider_id = pr.id
199
- WHERE
200
- LOWER(t.name) = ANY(ARRAY[{', '.join(f"'{tag}'" for tag in tags)}])
201
- OR LOWER(p.title) LIKE ANY(ARRAY[{', '.join(f"'%{kw}%'" for kw in keywords)}])
202
- OR LOWER(p.author) LIKE ANY(ARRAY[{', '.join(f"'%{kw}%'" for kw in keywords)}])
203
- OR LOWER(pr.name) LIKE ANY(ARRAY[{', '.join(f"'%{kw}%'" for kw in keywords)}])
204
  ORDER BY p.published_date DESC NULLS LAST
205
- LIMIT 9
206
  """
207
 
208
 
 
165
  tags = classification["tags"]
166
 
167
  if intent == QueryIntent.KEYWORD:
168
+ keyword_example = keywords[0] if keywords else "keyword"
169
  return f"""
170
  Search using KEYWORD approach:
171
  - Search terms: {', '.join(keywords)}
172
  - Search in: posts.title, posts.author, providers.name
173
+ - Use LOWER(column) LIKE '%keyword%' for flexible matching
174
+ - Example: LOWER(p.title) LIKE '%{keyword_example}%'
175
+ - This matches word variants: '{keyword_example}', '{keyword_example}n', '{keyword_example}\\'s', etc.
176
  """
177
 
178
  elif intent == QueryIntent.TAG:
179
  return f"""
180
  Search using TAG approach:
181
  - Tag names: {', '.join(tags)}
182
+ - 88% of posts (3,362) have tags - tag search is highly effective!
183
+ - Use LOWER(t.name) LIKE '%tagname%' for flexible matching
184
  - Join with post_tags and tags tables
 
185
  """
186
 
187
  else: # HYBRID
188
  return f"""
189
+ Search using HYBRID approach (RECOMMENDED):
190
+ - Tags to search: {', '.join(tags)}
191
+ - Keywords to search: {', '.join(keywords)}
192
  - Use OR logic: tag matches OR keyword matches in title/author
193
+ - 88% of posts have tags, so tag search is primary
194
 
195
  Recommended SQL pattern:
196
  SELECT DISTINCT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type
 
198
  LEFT JOIN post_tags pt ON p.id = pt.post_id
199
  LEFT JOIN tags t ON pt.tag_id = t.id
200
  LEFT JOIN providers pr ON p.provider_id = pr.id
201
+ WHERE (
202
+ {' OR '.join(f"LOWER(t.name) LIKE '%{tag}%'" for tag in tags)}
203
+ OR {' OR '.join(f"LOWER(p.title) LIKE '%{kw}%'" for kw in keywords)}
204
+ OR {' OR '.join(f"LOWER(p.author) LIKE '%{kw}%'" for kw in keywords)}
205
+ )
206
  ORDER BY p.published_date DESC NULLS LAST
207
+ LIMIT 50
208
  """
209
 
210
 
src/vanna.py CHANGED
@@ -68,7 +68,7 @@ class CustomSQLSystemPromptBuilder(SystemPromptBuilder):
68
  prompt += (
69
  "\n## Database Schema\n"
70
  "Tables:\n"
71
- "- posts (id, title, source_url, author, published_date, image_url, type, provider_id, created_at, updated_at, dead)\n"
72
  "- providers (id, name)\n"
73
  "- provider_attributes (id, provider_id, type, name)\n"
74
  "- post_provider_attributes (post_id, attribute_id)\n"
@@ -96,7 +96,7 @@ class CustomSQLSystemPromptBuilder(SystemPromptBuilder):
96
  "- `providers.name`: name of the publishing organization (e.g., 'Nuanced', 'SND').\n"
97
  "- `tags.name`: thematic keyword or topic (e.g., '3D', 'AI', 'Design').\n"
98
  "- `post_tags.weight`: relevance score between a post and a tag.\n"
99
- "- `posts.dead`: boolean flag indicating if the post is dead/removed (true = dead, false = active).\n"
100
  )
101
 
102
  # ======================
@@ -104,33 +104,38 @@ class CustomSQLSystemPromptBuilder(SystemPromptBuilder):
104
  # ======================
105
  prompt += (
106
  "\n## Business Logic\n"
107
- "- **ALWAYS filter out dead posts**: Include `WHERE p.dead = false` (or `AND p.dead = false`) in every query. Never return posts where dead = true.\n"
108
  "- A query mentioning an organization (e.g., 'New York Times') should search both `posts.author` and `providers.name`.\n"
109
  "- Return all post types (spotlight, resource, insight) unless the user specifies otherwise.\n"
110
  "- Tags link posts to specific themes or disciplines.\n"
111
  "- A single post may have multiple tags, awards, or categories.\n"
112
  "- If the user mentions a year (e.g., 'in 2021'), filter with `EXTRACT(YEAR FROM published_date) = 2021`.\n"
113
  "- If the user says 'recently', filter posts from the last 90 days.\n"
114
- "- Always limit exploratory results to 9 rows.\n"
115
  "\n"
116
- "## CRITICAL: Search Strategy\n"
117
- "**IMPORTANT**: Only 3 posts currently have tags. Most posts (7,245+) are NOT tagged yet.\n"
 
 
118
  "\n"
119
  "**Hybrid Search Approach (RECOMMENDED)**:\n"
120
- "- ALWAYS use a hybrid approach combining tag search AND keyword search with OR logic.\n"
121
- "- Use LEFT JOINs for tags (not INNER JOIN) so untagged posts are included.\n"
122
  "\n"
123
- "**Keyword Matching - Use PostgreSQL Regex for Exact Word Boundaries**:\n"
124
- "- Use ~* operator for case-insensitive regex matching\n"
125
- "- Use \\m and \\M for word boundaries (start and end of word)\n"
126
- "- Pattern: column ~* '\\\\mkeyword\\\\M'\n"
127
- "- Example: p.title ~* '\\\\mf1\\\\M' matches 'F1' but NOT 'profile' or 'if'\n"
128
- "- This ensures exact word matching, not substring matching\n"
129
  "\n"
130
- "**When to use tag-only search**: Only if user explicitly mentions 'tagged with' or 'tag:'.\n"
131
- "**When to use keyword-only search**: For author/organization names, or when tags are not relevant.\n"
 
 
 
 
132
  "\n"
133
- "This ensures maximum result coverage while the database is being enriched with tags.\n"
 
134
  )
135
 
136
  # ======================
@@ -167,24 +172,32 @@ class CustomSQLSystemPromptBuilder(SystemPromptBuilder):
167
  "LEFT JOIN post_tags pt ON p.id = pt.post_id "
168
  "LEFT JOIN tags t ON pt.tag_id = t.id "
169
  "LEFT JOIN providers pr ON p.provider_id = pr.id "
170
- "WHERE p.dead = false AND (t.name ~* '\\\\mf1\\\\M' OR t.name ~* '\\\\mformula\\\\M' "
171
- "OR p.title ~* '\\\\mf1\\\\M' OR p.title ~* '\\\\mformula\\\\M' "
172
- "OR p.author ~* '\\\\mf1\\\\M') "
173
- "ORDER BY p.published_date DESC NULLS LAST LIMIT 9;\"]\n"
174
  "\nUser: 'Show me posts from The New York Times'\n"
175
  "Assistant: [call run_sql with \"SELECT DISTINCT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
176
  "FROM posts p "
177
  "LEFT JOIN providers pr ON p.provider_id = pr.id "
178
- "WHERE p.dead = false AND (p.author ~* '\\\\mnew\\\\M.*\\\\myork\\\\M.*\\\\mtimes\\\\M' OR pr.name ~* '\\\\mnew\\\\M.*\\\\myork\\\\M.*\\\\mtimes\\\\M') "
179
- "ORDER BY p.published_date DESC NULLS LAST LIMIT 9;\"]\n"
 
 
 
 
 
 
 
 
180
  "\nUser: 'interactive visualizations'\n"
181
  "Assistant: [call run_sql with \"SELECT DISTINCT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
182
  "FROM posts p "
183
  "LEFT JOIN post_tags pt ON p.id = pt.post_id "
184
  "LEFT JOIN tags t ON pt.tag_id = t.id "
185
- "WHERE p.dead = false AND (t.name ~* '\\\\minteractive\\\\M' OR p.title ~* '\\\\minteractive\\\\M' "
186
- "OR p.title ~* '\\\\mvisualization\\\\M' OR t.name ~* '\\\\mdataviz\\\\M') "
187
- "ORDER BY p.published_date DESC NULLS LAST LIMIT 9;\"]\n"
188
  )
189
 
190
  # ======================
 
68
  prompt += (
69
  "\n## Database Schema\n"
70
  "Tables:\n"
71
+ "- posts (id, title, source_url, author, published_date, image_url, type, provider_id, created_at, updated_at, content_markdown, fts)\n"
72
  "- providers (id, name)\n"
73
  "- provider_attributes (id, provider_id, type, name)\n"
74
  "- post_provider_attributes (post_id, attribute_id)\n"
 
96
  "- `providers.name`: name of the publishing organization (e.g., 'Nuanced', 'SND').\n"
97
  "- `tags.name`: thematic keyword or topic (e.g., '3D', 'AI', 'Design').\n"
98
  "- `post_tags.weight`: relevance score between a post and a tag.\n"
99
+ "- `posts.fts`: tsvector column for full-text search (auto-generated from title and author).\n"
100
  )
101
 
102
  # ======================
 
104
  # ======================
105
  prompt += (
106
  "\n## Business Logic\n"
 
107
  "- A query mentioning an organization (e.g., 'New York Times') should search both `posts.author` and `providers.name`.\n"
108
  "- Return all post types (spotlight, resource, insight) unless the user specifies otherwise.\n"
109
  "- Tags link posts to specific themes or disciplines.\n"
110
  "- A single post may have multiple tags, awards, or categories.\n"
111
  "- If the user mentions a year (e.g., 'in 2021'), filter with `EXTRACT(YEAR FROM published_date) = 2021`.\n"
112
  "- If the user says 'recently', filter posts from the last 90 days.\n"
113
+ "- Default limit is 50 rows for search results. Use OFFSET for pagination if needed.\n"
114
  "\n"
115
+ "## Search Strategy\n"
116
+ "**TAG COVERAGE**: 3,362 posts (88%) have tags. Tag-based search is highly effective!\n"
117
+ "- 9,105 tags available including countries (russia, china, usa), topics (climate change, politics), and formats (interactive, dataviz)\n"
118
+ "- Use tag matching as PRIMARY search for topic-based queries\n"
119
  "\n"
120
  "**Hybrid Search Approach (RECOMMENDED)**:\n"
121
+ "- Combine tag search AND keyword search with OR logic for maximum coverage\n"
122
+ "- Use LEFT JOINs for tags to also include the 12% of untagged posts\n"
123
  "\n"
124
+ "**Keyword Matching - Use ILIKE for Flexible Matching**:\n"
125
+ "- Use LOWER(column) LIKE '%keyword%' for case-insensitive substring matching\n"
126
+ "- Example: LOWER(p.title) LIKE '%russia%' matches 'Russia', 'Russian', 'Russia\\'s', etc.\n"
127
+ "- This ensures word variants are captured (much better than exact word boundary matching)\n"
128
+ "- For multi-word searches: LOWER(p.title) LIKE '%new york%'\n"
 
129
  "\n"
130
+ "**Full-Text Search (for relevance ranking)**:\n"
131
+ "- The posts table has an 'fts' column (tsvector) for full-text search\n"
132
+ "- Use: p.fts @@ plainto_tsquery('english', 'search terms')\n"
133
+ "- For relevance-ranked results: ORDER BY ts_rank(p.fts, plainto_tsquery('english', 'search terms')) DESC\n"
134
+ "- FTS handles stemming automatically: 'visualization' matches 'visualizations'\n"
135
+ "- Combine FTS with ILIKE fallback: WHERE p.fts @@ query OR LOWER(p.title) LIKE '%keyword%'\n"
136
  "\n"
137
+ "**When to use tag-only search**: Only if user explicitly mentions 'tagged with' or 'tag:'.\n"
138
+ "**When to use keyword-only search**: For author/organization names.\n"
139
  )
140
 
141
  # ======================
 
172
  "LEFT JOIN post_tags pt ON p.id = pt.post_id "
173
  "LEFT JOIN tags t ON pt.tag_id = t.id "
174
  "LEFT JOIN providers pr ON p.provider_id = pr.id "
175
+ "WHERE (LOWER(t.name) LIKE '%f1%' OR LOWER(t.name) LIKE '%formula%' "
176
+ "OR LOWER(p.title) LIKE '%f1%' OR LOWER(p.title) LIKE '%formula%' "
177
+ "OR LOWER(p.author) LIKE '%f1%') "
178
+ "ORDER BY p.published_date DESC NULLS LAST LIMIT 50;\"]\n"
179
  "\nUser: 'Show me posts from The New York Times'\n"
180
  "Assistant: [call run_sql with \"SELECT DISTINCT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
181
  "FROM posts p "
182
  "LEFT JOIN providers pr ON p.provider_id = pr.id "
183
+ "WHERE (LOWER(p.author) LIKE '%new york times%' OR LOWER(pr.name) LIKE '%new york times%') "
184
+ "ORDER BY p.published_date DESC NULLS LAST LIMIT 50;\"]\n"
185
+ "\nUser: 'Russia' or 'Show me Russia content'\n"
186
+ "Assistant: [call run_sql with \"SELECT DISTINCT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
187
+ "FROM posts p "
188
+ "LEFT JOIN post_tags pt ON p.id = pt.post_id "
189
+ "LEFT JOIN tags t ON pt.tag_id = t.id "
190
+ "WHERE (LOWER(t.name) LIKE '%russia%' "
191
+ "OR LOWER(p.title) LIKE '%russia%' OR LOWER(p.author) LIKE '%russia%') "
192
+ "ORDER BY p.published_date DESC NULLS LAST LIMIT 50;\"]\n"
193
  "\nUser: 'interactive visualizations'\n"
194
  "Assistant: [call run_sql with \"SELECT DISTINCT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
195
  "FROM posts p "
196
  "LEFT JOIN post_tags pt ON p.id = pt.post_id "
197
  "LEFT JOIN tags t ON pt.tag_id = t.id "
198
+ "WHERE (LOWER(t.name) LIKE '%interactive%' OR LOWER(p.title) LIKE '%interactive%' "
199
+ "OR LOWER(p.title) LIKE '%visualization%' OR LOWER(t.name) LIKE '%dataviz%') "
200
+ "ORDER BY p.published_date DESC NULLS LAST LIMIT 50;\"]\n"
201
  )
202
 
203
  # ======================