Spaces:
Running
Running
Tom
Claude
committed on
Commit
·
d49ed4a
1
Parent(s):
b516494
Improve search: fix tag info, use ILIKE, add FTS, increase limit to 50
Browse files
- Fix incorrect tag coverage info (was "3 posts", actually 3,362 posts / 88%)
- Replace word boundary regex with ILIKE for flexible matching
(now "Russia" finds "Russian", "Russia's", etc.)
- Add FTS query guidance with ts_rank() for relevance ranking
- Remove non-existent 'dead' column references from schema
- Increase default result limit from 9 to 50 for better coverage
- Update all SQL examples to use ILIKE patterns
Database migrations applied:
- add_fts_to_posts: Added FTS tsvector column with GIN index
- add_trigram_indexes: Enabled pg_trgm extension for fuzzy matching
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
- src/query_intent_classifier.py +16 -14
- src/vanna.py +39 -26
src/query_intent_classifier.py
CHANGED
|
@@ -165,30 +165,32 @@ class IntentClassifier:
|
|
| 165 |
tags = classification["tags"]
|
| 166 |
|
| 167 |
if intent == QueryIntent.KEYWORD:
|
|
|
|
| 168 |
return f"""
|
| 169 |
Search using KEYWORD approach:
|
| 170 |
- Search terms: {', '.join(keywords)}
|
| 171 |
- Search in: posts.title, posts.author, providers.name
|
| 172 |
-
- Use
|
| 173 |
-
-
|
|
|
|
| 174 |
"""
|
| 175 |
|
| 176 |
elif intent == QueryIntent.TAG:
|
| 177 |
return f"""
|
| 178 |
Search using TAG approach:
|
| 179 |
- Tag names: {', '.join(tags)}
|
| 180 |
-
-
|
|
|
|
| 181 |
- Join with post_tags and tags tables
|
| 182 |
-
- Note: Only a few posts are tagged, results may be limited
|
| 183 |
"""
|
| 184 |
|
| 185 |
else: # HYBRID
|
| 186 |
return f"""
|
| 187 |
-
Search using HYBRID approach:
|
| 188 |
-
-
|
| 189 |
-
-
|
| 190 |
- Use OR logic: tag matches OR keyword matches in title/author
|
| 191 |
-
-
|
| 192 |
|
| 193 |
Recommended SQL pattern:
|
| 194 |
SELECT DISTINCT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type
|
|
@@ -196,13 +198,13 @@ FROM posts p
|
|
| 196 |
LEFT JOIN post_tags pt ON p.id = pt.post_id
|
| 197 |
LEFT JOIN tags t ON pt.tag_id = t.id
|
| 198 |
LEFT JOIN providers pr ON p.provider_id = pr.id
|
| 199 |
-
WHERE
|
| 200 |
-
|
| 201 |
-
OR LOWER(p.title) LIKE
|
| 202 |
-
OR LOWER(p.author) LIKE
|
| 203 |
-
|
| 204 |
ORDER BY p.published_date DESC NULLS LAST
|
| 205 |
-
LIMIT
|
| 206 |
"""
|
| 207 |
|
| 208 |
|
|
|
|
| 165 |
tags = classification["tags"]
|
| 166 |
|
| 167 |
if intent == QueryIntent.KEYWORD:
|
| 168 |
+
keyword_example = keywords[0] if keywords else "keyword"
|
| 169 |
return f"""
|
| 170 |
Search using KEYWORD approach:
|
| 171 |
- Search terms: {', '.join(keywords)}
|
| 172 |
- Search in: posts.title, posts.author, providers.name
|
| 173 |
+
- Use LOWER(column) LIKE '%keyword%' for flexible matching
|
| 174 |
+
- Example: LOWER(p.title) LIKE '%{keyword_example}%'
|
| 175 |
+
- This matches word variants: '{keyword_example}', '{keyword_example}n', '{keyword_example}\\'s', etc.
|
| 176 |
"""
|
| 177 |
|
| 178 |
elif intent == QueryIntent.TAG:
|
| 179 |
return f"""
|
| 180 |
Search using TAG approach:
|
| 181 |
- Tag names: {', '.join(tags)}
|
| 182 |
+
- 88% of posts (3,362) have tags - tag search is highly effective!
|
| 183 |
+
- Use LOWER(t.name) LIKE '%tagname%' for flexible matching
|
| 184 |
- Join with post_tags and tags tables
|
|
|
|
| 185 |
"""
|
| 186 |
|
| 187 |
else: # HYBRID
|
| 188 |
return f"""
|
| 189 |
+
Search using HYBRID approach (RECOMMENDED):
|
| 190 |
+
- Tags to search: {', '.join(tags)}
|
| 191 |
+
- Keywords to search: {', '.join(keywords)}
|
| 192 |
- Use OR logic: tag matches OR keyword matches in title/author
|
| 193 |
+
- 88% of posts have tags, so tag search is primary
|
| 194 |
|
| 195 |
Recommended SQL pattern:
|
| 196 |
SELECT DISTINCT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type
|
|
|
|
| 198 |
LEFT JOIN post_tags pt ON p.id = pt.post_id
|
| 199 |
LEFT JOIN tags t ON pt.tag_id = t.id
|
| 200 |
LEFT JOIN providers pr ON p.provider_id = pr.id
|
| 201 |
+
WHERE (
|
| 202 |
+
{' OR '.join(f"LOWER(t.name) LIKE '%{tag}%'" for tag in tags)}
|
| 203 |
+
OR {' OR '.join(f"LOWER(p.title) LIKE '%{kw}%'" for kw in keywords)}
|
| 204 |
+
OR {' OR '.join(f"LOWER(p.author) LIKE '%{kw}%'" for kw in keywords)}
|
| 205 |
+
)
|
| 206 |
ORDER BY p.published_date DESC NULLS LAST
|
| 207 |
+
LIMIT 50
|
| 208 |
"""
|
| 209 |
|
| 210 |
|
src/vanna.py
CHANGED
|
@@ -68,7 +68,7 @@ class CustomSQLSystemPromptBuilder(SystemPromptBuilder):
|
|
| 68 |
prompt += (
|
| 69 |
"\n## Database Schema\n"
|
| 70 |
"Tables:\n"
|
| 71 |
-
"- posts (id, title, source_url, author, published_date, image_url, type, provider_id, created_at, updated_at,
|
| 72 |
"- providers (id, name)\n"
|
| 73 |
"- provider_attributes (id, provider_id, type, name)\n"
|
| 74 |
"- post_provider_attributes (post_id, attribute_id)\n"
|
|
@@ -96,7 +96,7 @@ class CustomSQLSystemPromptBuilder(SystemPromptBuilder):
|
|
| 96 |
"- `providers.name`: name of the publishing organization (e.g., 'Nuanced', 'SND').\n"
|
| 97 |
"- `tags.name`: thematic keyword or topic (e.g., '3D', 'AI', 'Design').\n"
|
| 98 |
"- `post_tags.weight`: relevance score between a post and a tag.\n"
|
| 99 |
-
"- `posts.
|
| 100 |
)
|
| 101 |
|
| 102 |
# ======================
|
|
@@ -104,33 +104,38 @@ class CustomSQLSystemPromptBuilder(SystemPromptBuilder):
|
|
| 104 |
# ======================
|
| 105 |
prompt += (
|
| 106 |
"\n## Business Logic\n"
|
| 107 |
-
"- **ALWAYS filter out dead posts**: Include `WHERE p.dead = false` (or `AND p.dead = false`) in every query. Never return posts where dead = true.\n"
|
| 108 |
"- A query mentioning an organization (e.g., 'New York Times') should search both `posts.author` and `providers.name`.\n"
|
| 109 |
"- Return all post types (spotlight, resource, insight) unless the user specifies otherwise.\n"
|
| 110 |
"- Tags link posts to specific themes or disciplines.\n"
|
| 111 |
"- A single post may have multiple tags, awards, or categories.\n"
|
| 112 |
"- If the user mentions a year (e.g., 'in 2021'), filter with `EXTRACT(YEAR FROM published_date) = 2021`.\n"
|
| 113 |
"- If the user says 'recently', filter posts from the last 90 days.\n"
|
| 114 |
-
"-
|
| 115 |
"\n"
|
| 116 |
-
"##
|
| 117 |
-
"**
|
|
|
|
|
|
|
| 118 |
"\n"
|
| 119 |
"**Hybrid Search Approach (RECOMMENDED)**:\n"
|
| 120 |
-
"-
|
| 121 |
-
"- Use LEFT JOINs for tags
|
| 122 |
"\n"
|
| 123 |
-
"**Keyword Matching - Use
|
| 124 |
-
"- Use
|
| 125 |
-
"-
|
| 126 |
-
"-
|
| 127 |
-
"-
|
| 128 |
-
"- This ensures exact word matching, not substring matching\n"
|
| 129 |
"\n"
|
| 130 |
-
"**
|
| 131 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
"\n"
|
| 133 |
-
"
|
|
|
|
| 134 |
)
|
| 135 |
|
| 136 |
# ======================
|
|
@@ -167,24 +172,32 @@ class CustomSQLSystemPromptBuilder(SystemPromptBuilder):
|
|
| 167 |
"LEFT JOIN post_tags pt ON p.id = pt.post_id "
|
| 168 |
"LEFT JOIN tags t ON pt.tag_id = t.id "
|
| 169 |
"LEFT JOIN providers pr ON p.provider_id = pr.id "
|
| 170 |
-
"WHERE
|
| 171 |
-
"OR p.title
|
| 172 |
-
"OR p.author
|
| 173 |
-
"ORDER BY p.published_date DESC NULLS LAST LIMIT
|
| 174 |
"\nUser: 'Show me posts from The New York Times'\n"
|
| 175 |
"Assistant: [call run_sql with \"SELECT DISTINCT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
|
| 176 |
"FROM posts p "
|
| 177 |
"LEFT JOIN providers pr ON p.provider_id = pr.id "
|
| 178 |
-
"WHERE
|
| 179 |
-
"ORDER BY p.published_date DESC NULLS LAST LIMIT
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
"\nUser: 'interactive visualizations'\n"
|
| 181 |
"Assistant: [call run_sql with \"SELECT DISTINCT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
|
| 182 |
"FROM posts p "
|
| 183 |
"LEFT JOIN post_tags pt ON p.id = pt.post_id "
|
| 184 |
"LEFT JOIN tags t ON pt.tag_id = t.id "
|
| 185 |
-
"WHERE
|
| 186 |
-
"OR p.title
|
| 187 |
-
"ORDER BY p.published_date DESC NULLS LAST LIMIT
|
| 188 |
)
|
| 189 |
|
| 190 |
# ======================
|
|
|
|
| 68 |
prompt += (
|
| 69 |
"\n## Database Schema\n"
|
| 70 |
"Tables:\n"
|
| 71 |
+
"- posts (id, title, source_url, author, published_date, image_url, type, provider_id, created_at, updated_at, content_markdown, fts)\n"
|
| 72 |
"- providers (id, name)\n"
|
| 73 |
"- provider_attributes (id, provider_id, type, name)\n"
|
| 74 |
"- post_provider_attributes (post_id, attribute_id)\n"
|
|
|
|
| 96 |
"- `providers.name`: name of the publishing organization (e.g., 'Nuanced', 'SND').\n"
|
| 97 |
"- `tags.name`: thematic keyword or topic (e.g., '3D', 'AI', 'Design').\n"
|
| 98 |
"- `post_tags.weight`: relevance score between a post and a tag.\n"
|
| 99 |
+
"- `posts.fts`: tsvector column for full-text search (auto-generated from title and author).\n"
|
| 100 |
)
|
| 101 |
|
| 102 |
# ======================
|
|
|
|
| 104 |
# ======================
|
| 105 |
prompt += (
|
| 106 |
"\n## Business Logic\n"
|
|
|
|
| 107 |
"- A query mentioning an organization (e.g., 'New York Times') should search both `posts.author` and `providers.name`.\n"
|
| 108 |
"- Return all post types (spotlight, resource, insight) unless the user specifies otherwise.\n"
|
| 109 |
"- Tags link posts to specific themes or disciplines.\n"
|
| 110 |
"- A single post may have multiple tags, awards, or categories.\n"
|
| 111 |
"- If the user mentions a year (e.g., 'in 2021'), filter with `EXTRACT(YEAR FROM published_date) = 2021`.\n"
|
| 112 |
"- If the user says 'recently', filter posts from the last 90 days.\n"
|
| 113 |
+
"- Default limit is 50 rows for search results. Use OFFSET for pagination if needed.\n"
|
| 114 |
"\n"
|
| 115 |
+
"## Search Strategy\n"
|
| 116 |
+
"**TAG COVERAGE**: 3,362 posts (88%) have tags. Tag-based search is highly effective!\n"
|
| 117 |
+
"- 9,105 tags available including countries (russia, china, usa), topics (climate change, politics), and formats (interactive, dataviz)\n"
|
| 118 |
+
"- Use tag matching as PRIMARY search for topic-based queries\n"
|
| 119 |
"\n"
|
| 120 |
"**Hybrid Search Approach (RECOMMENDED)**:\n"
|
| 121 |
+
"- Combine tag search AND keyword search with OR logic for maximum coverage\n"
|
| 122 |
+
"- Use LEFT JOINs for tags to also include the 12% of untagged posts\n"
|
| 123 |
"\n"
|
| 124 |
+
"**Keyword Matching - Use ILIKE for Flexible Matching**:\n"
|
| 125 |
+
"- Use LOWER(column) LIKE '%keyword%' for case-insensitive substring matching\n"
|
| 126 |
+
"- Example: LOWER(p.title) LIKE '%russia%' matches 'Russia', 'Russian', 'Russia\\'s', etc.\n"
|
| 127 |
+
"- This ensures word variants are captured (much better than exact word boundary matching)\n"
|
| 128 |
+
"- For multi-word searches: LOWER(p.title) LIKE '%new york%'\n"
|
|
|
|
| 129 |
"\n"
|
| 130 |
+
"**Full-Text Search (for relevance ranking)**:\n"
|
| 131 |
+
"- The posts table has an 'fts' column (tsvector) for full-text search\n"
|
| 132 |
+
"- Use: p.fts @@ plainto_tsquery('english', 'search terms')\n"
|
| 133 |
+
"- For relevance-ranked results: ORDER BY ts_rank(p.fts, plainto_tsquery('english', 'search terms')) DESC\n"
|
| 134 |
+
"- FTS handles stemming automatically: 'visualization' matches 'visualizations'\n"
|
| 135 |
+
"- Combine FTS with ILIKE fallback: WHERE p.fts @@ query OR LOWER(p.title) LIKE '%keyword%'\n"
|
| 136 |
"\n"
|
| 137 |
+
"**When to use tag-only search**: Only if user explicitly mentions 'tagged with' or 'tag:'.\n"
|
| 138 |
+
"**When to use keyword-only search**: For author/organization names.\n"
|
| 139 |
)
|
| 140 |
|
| 141 |
# ======================
|
|
|
|
| 172 |
"LEFT JOIN post_tags pt ON p.id = pt.post_id "
|
| 173 |
"LEFT JOIN tags t ON pt.tag_id = t.id "
|
| 174 |
"LEFT JOIN providers pr ON p.provider_id = pr.id "
|
| 175 |
+
"WHERE (LOWER(t.name) LIKE '%f1%' OR LOWER(t.name) LIKE '%formula%' "
|
| 176 |
+
"OR LOWER(p.title) LIKE '%f1%' OR LOWER(p.title) LIKE '%formula%' "
|
| 177 |
+
"OR LOWER(p.author) LIKE '%f1%') "
|
| 178 |
+
"ORDER BY p.published_date DESC NULLS LAST LIMIT 50;\"]\n"
|
| 179 |
"\nUser: 'Show me posts from The New York Times'\n"
|
| 180 |
"Assistant: [call run_sql with \"SELECT DISTINCT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
|
| 181 |
"FROM posts p "
|
| 182 |
"LEFT JOIN providers pr ON p.provider_id = pr.id "
|
| 183 |
+
"WHERE (LOWER(p.author) LIKE '%new york times%' OR LOWER(pr.name) LIKE '%new york times%') "
|
| 184 |
+
"ORDER BY p.published_date DESC NULLS LAST LIMIT 50;\"]\n"
|
| 185 |
+
"\nUser: 'Russia' or 'Show me Russia content'\n"
|
| 186 |
+
"Assistant: [call run_sql with \"SELECT DISTINCT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
|
| 187 |
+
"FROM posts p "
|
| 188 |
+
"LEFT JOIN post_tags pt ON p.id = pt.post_id "
|
| 189 |
+
"LEFT JOIN tags t ON pt.tag_id = t.id "
|
| 190 |
+
"WHERE (LOWER(t.name) LIKE '%russia%' "
|
| 191 |
+
"OR LOWER(p.title) LIKE '%russia%' OR LOWER(p.author) LIKE '%russia%') "
|
| 192 |
+
"ORDER BY p.published_date DESC NULLS LAST LIMIT 50;\"]\n"
|
| 193 |
"\nUser: 'interactive visualizations'\n"
|
| 194 |
"Assistant: [call run_sql with \"SELECT DISTINCT p.id, p.title, p.source_url, p.author, p.published_date, p.image_url, p.type "
|
| 195 |
"FROM posts p "
|
| 196 |
"LEFT JOIN post_tags pt ON p.id = pt.post_id "
|
| 197 |
"LEFT JOIN tags t ON pt.tag_id = t.id "
|
| 198 |
+
"WHERE (LOWER(t.name) LIKE '%interactive%' OR LOWER(p.title) LIKE '%interactive%' "
|
| 199 |
+
"OR LOWER(p.title) LIKE '%visualization%' OR LOWER(t.name) LIKE '%dataviz%') "
|
| 200 |
+
"ORDER BY p.published_date DESC NULLS LAST LIMIT 50;\"]\n"
|
| 201 |
)
|
| 202 |
|
| 203 |
# ======================
|