214 lines
7.3 KiB
Python
214 lines
7.3 KiB
Python
"""
|
|
Tests for keyword extraction and improved hybrid search
|
|
"""
|
|
|
|
import pytest
|
|
|
|
from src.server.services.search.keyword_extractor import (
|
|
KeywordExtractor,
|
|
build_search_terms,
|
|
extract_keywords,
|
|
)
|
|
|
|
|
|
class TestKeywordExtractor:
    """Unit tests for ``KeywordExtractor.extract_keywords``."""

    @pytest.fixture
    def extractor(self):
        """Build the ``KeywordExtractor`` instance used by the tests below."""
        return KeywordExtractor()

    def test_simple_keyword_extraction(self, extractor):
        """A short two-word query yields both words in lowercase."""
        keywords = extractor.extract_keywords("Supabase authentication")

        for expected in ("supabase", "authentication"):
            assert expected in keywords
        assert len(keywords) >= 2

    def test_complex_query_extraction(self, extractor):
        """A longer query yields each meaningful word (or a compound of them)."""
        keywords = extractor.extract_keywords("Supabase auth flow best practices")

        for expected in ("supabase", "auth", "flow"):
            assert expected in keywords
        # Either the compound form or the bare word is acceptable.
        assert any(form in keywords for form in ("best_practices", "practices"))

    def test_stop_word_filtering(self, extractor):
        """Stop words are dropped while the technical terms survive."""
        keywords = extractor.extract_keywords(
            "How to use the React component with the database"
        )

        # None of the stop words may appear in the result.
        for stop_word in ("how", "to", "the", "with"):
            assert stop_word not in keywords

        # The technical terms must still be present.
        for kept in ("react", "component", "database"):
            assert kept in keywords

    def test_technical_terms_preserved(self, extractor):
        """Technology names in a query are kept as keywords."""
        keywords = extractor.extract_keywords(
            "PostgreSQL full-text search with Python API"
        )

        assert any(name in keywords for name in ("postgresql", "postgres"))
        for expected in ("python", "api"):
            assert expected in keywords

    def test_compound_terms(self, extractor):
        """Multi-word and hyphenated terms are recognised as compounds."""
        keywords = extractor.extract_keywords(
            "best practices for real-time websocket connections"
        )

        # The compound forms are expected in the output.
        assert "best_practices" in keywords
        assert any(form in keywords for form in ("realtime", "real-time"))
        assert "websocket" in keywords

    def test_empty_query(self, extractor):
        """An empty query string produces an empty list."""
        assert extractor.extract_keywords("") == []

    def test_query_with_only_stopwords(self, extractor):
        """A query made up solely of stop words produces an empty list."""
        assert extractor.extract_keywords("the and with for in") == []

    def test_keyword_prioritization(self, extractor):
        """Repeated and technical terms are ranked near the front."""
        ranked = extractor.extract_keywords(
            "Python Python Django REST API framework Python"
        )

        # "Python" occurs three times in the query, so it must come first.
        assert ranked[0] == "python"

        # Technical terms are expected close to the top of the ranking.
        assert "django" in ranked[:3]
        assert "api" in ranked[:5]  # within the top five

    def test_max_keywords_limit(self, extractor):
        """The ``max_keywords`` argument caps the number of results."""
        query = "Python Django Flask FastAPI React Vue Angular TypeScript JavaScript HTML CSS"
        limited = extractor.extract_keywords(query, max_keywords=5)

        assert len(limited) <= 5
        # The most important terms must make the cut.
        for expected in ("python", "django"):
            assert expected in limited

    def test_min_length_filtering(self, extractor):
        """The ``min_length`` argument removes tokens that are too short."""
        keywords = extractor.extract_keywords("a b c API JWT DB SQL", min_length=3)

        # Single-letter tokens must be gone.
        for too_short in ("a", "b", "c"):
            assert too_short not in keywords

        # Tokens of three or more letters must remain.
        for long_enough in ("api", "jwt", "sql"):
            assert long_enough in keywords
|
|
|
|
|
|
class TestSearchTermBuilder:
    """Tests for the word-form variations produced by ``build_search_terms``."""

    def test_plural_variations(self):
        """Singular and plural counterparts of the keywords are generated."""
        terms = build_search_terms(["functions", "class", "error"])

        # Expected counterparts, in order: the singular of "functions",
        # the plural of "class", and the plural of "error".
        for variant in ("function", "classes", "errors"):
            assert variant in terms

    def test_verb_variations(self):
        """Inflected verbs are reduced to a base (or stemmed) form."""
        terms = build_search_terms(["creating", "updated", "testing"])

        # Either the full base form or its truncated stem is accepted.
        assert any(form in terms for form in ("create", "creat"))
        assert any(form in terms for form in ("update", "updat"))
        assert "test" in terms

    def test_no_duplicates(self):
        """The returned terms contain no repeated entries."""
        terms = build_search_terms(["test", "tests", "testing"])

        # Deduplicating through a set must not change the count.
        unique_terms = set(terms)
        assert len(terms) == len(unique_terms)
|
|
|
|
|
|
class TestIntegration:
    """End-to-end checks of ``extract_keywords`` on realistic search queries."""

    def test_real_world_query_1(self):
        """A "how to" question keeps its technical terms and drops filler."""
        keywords = extract_keywords(
            "How to implement JWT authentication in FastAPI with Supabase"
        )

        # The key technical terms must be extracted.
        for expected in ("jwt", "authentication", "fastapi", "supabase"):
            assert expected in keywords

        # Generic words must be absent. ("implement" is said to be filtered as
        # a technical stop word as well, but that is not asserted here.)
        for filler in ("how", "to"):
            assert filler not in keywords

    def test_real_world_query_2(self):
        """A comparison query keeps both product names and the topic words."""
        keywords = extract_keywords(
            "PostgreSQL full text search vs Elasticsearch performance comparison"
        )

        assert any(name in keywords for name in ("postgresql", "postgres"))
        for expected in ("elasticsearch", "performance", "comparison"):
            assert expected in keywords

        # "full text" may surface as one compound or as two separate words.
        assert "fulltext" in keywords or all(
            part in keywords for part in ("full", "text")
        )

    def test_real_world_query_3(self):
        """A debugging query keeps its language and framework vocabulary."""
        keywords = extract_keywords(
            "debugging async await issues in Node.js Express middleware"
        )

        assert any(form in keywords for form in ("debugging", "debug"))
        for expected in ("async", "await", "express", "middleware"):
            assert expected in keywords

        # "Node.js" may be normalised to "nodejs" or split down to "node".
        assert any(form in keywords for form in ("nodejs", "node"))

    def test_code_related_query(self):
        """An error-message style query keeps identifiers and technology names."""
        keywords = extract_keywords(
            "TypeError cannot read property undefined JavaScript React hooks"
        )

        assert any(form in keywords for form in ("typeerror", "type"))
        for expected in ("property", "undefined", "javascript", "react", "hooks"):
            assert expected in keywords
|