pgvector returning fewer rows than it should
/**
 * Finds the skills whose stored embedding is closest, by cosine distance, to
 * the embedding of `inputSkill`.
 *
 * The ranking runs in the database: rows are ordered by ascending cosine
 * distance and capped at `topK`.
 *
 * NOTE(review): if `skills.vector` is covered by an approximate pgvector index
 * (IVFFlat or HNSW), an index scan can yield fewer rows than the requested
 * limit; the pgvector docs describe `ivfflat.probes`, `hnsw.ef_search` and
 * iterative index scans for this. The index type is not visible from here, so
 * confirm before tuning.
 *
 * @param inputSkill - Text to embed and search for.
 * @param topK - Maximum number of rows to request; fractional values are
 *   floored. Defaults to 10000.
 * @returns `{ technology, score }` entries in the order the database returned
 *   them, where `score` is `1 - cosine distance` rounded to 6 decimals. Rows
 *   whose similarity is missing or not numeric are omitted. Resolves to `[]`
 *   when `topK` is not a finite number >= 1, or when any step throws (the
 *   error is logged, not rethrown).
 */
export const querySimilarTechnologies = async (
  inputSkill: string,
  topK: number = 10000,
) => {
  // Normalises one similarity cell. A plain `.toFixed()` call throws on `null`
  // (e.g. a row with no vector) or on a numeric string, and that single throw
  // used to discard the whole result set via the catch-all below.
  const toScore = (raw: unknown): number => {
    const value =
      typeof raw === "number"
        ? raw
        : typeof raw === "string"
          ? Number.parseFloat(raw)
          : Number.NaN;
    return Number.isFinite(value)
      ? Number.parseFloat(value.toFixed(6))
      : Number.NaN;
  };

  // Reject unusable limits up front so no embedding is generated for a query
  // that could only fail or return nothing.
  const limit = Number.isFinite(topK) ? Math.floor(topK) : 0;
  if (limit < 1) {
    console.warn(
      `Ignoring similarity search for "${inputSkill}": topK must be a finite number >= 1 (received ${topK}).`,
    );
    return [];
  }

  try {
    console.log(
      `[1] Starting search for similar technologies to: "${inputSkill}"`,
    );

    // Step 1: Generate embedding for the input skill
    const embedding = await getEmbedding(inputSkill);
    console.log(`[2] Embedding generated for: "${inputSkill}"`);

    // Step 2: Perform similarity search directly in PostgreSQL
    const similarity = sql<number>`1 - (${cosineDistance(skills.vector, embedding)})`;
    const similarSkills = await db
      .select({
        technology: skills.skill,
        similarity,
      })
      .from(skills)
      .orderBy(cosineDistance(skills.vector, embedding))
      .limit(limit);
    console.log(
      `[3] Retrieved ${similarSkills.length} similar technologies after similarity search.`,
    );

    // Step 3: Shape the rows, skipping any without a usable similarity value.
    const result = similarSkills
      .map((s) => ({
        technology: s.technology,
        score: toScore(s.similarity),
      }))
      .filter((entry) => !Number.isNaN(entry.score));

    const skipped = similarSkills.length - result.length;
    if (skipped > 0) {
      console.warn(
        `Skipped ${skipped} row(s) without a numeric similarity value.`,
      );
    }

    console.log(`[5] Returning ${result.length} similar technologies.`);
    return result;
  } catch (error) {
    // Best-effort contract: callers receive an empty list instead of a rejection.
    console.error("Error querying similar technologies:", error);
    return [];
  }
};
/**
 * Finds the skills whose stored embedding is closest, by cosine distance, to
 * the embedding of `inputSkill`.
 *
 * The ranking runs in the database: rows are ordered by ascending cosine
 * distance and capped at `topK`.
 *
 * NOTE(review): if `skills.vector` is covered by an approximate pgvector index
 * (IVFFlat or HNSW), an index scan can yield fewer rows than the requested
 * limit; the pgvector docs describe `ivfflat.probes`, `hnsw.ef_search` and
 * iterative index scans for this. The index type is not visible from here, so
 * confirm before tuning.
 *
 * @param inputSkill - Text to embed and search for.
 * @param topK - Maximum number of rows to request; fractional values are
 *   floored. Defaults to 10000.
 * @returns `{ technology, score }` entries in the order the database returned
 *   them, where `score` is `1 - cosine distance` rounded to 6 decimals. Rows
 *   whose similarity is missing or not numeric are omitted. Resolves to `[]`
 *   when `topK` is not a finite number >= 1, or when any step throws (the
 *   error is logged, not rethrown).
 */
export const querySimilarTechnologies = async (
  inputSkill: string,
  topK: number = 10000,
) => {
  // Normalises one similarity cell. A plain `.toFixed()` call throws on `null`
  // (e.g. a row with no vector) or on a numeric string, and that single throw
  // used to discard the whole result set via the catch-all below.
  const toScore = (raw: unknown): number => {
    const value =
      typeof raw === "number"
        ? raw
        : typeof raw === "string"
          ? Number.parseFloat(raw)
          : Number.NaN;
    return Number.isFinite(value)
      ? Number.parseFloat(value.toFixed(6))
      : Number.NaN;
  };

  // Reject unusable limits up front so no embedding is generated for a query
  // that could only fail or return nothing.
  const limit = Number.isFinite(topK) ? Math.floor(topK) : 0;
  if (limit < 1) {
    console.warn(
      `Ignoring similarity search for "${inputSkill}": topK must be a finite number >= 1 (received ${topK}).`,
    );
    return [];
  }

  try {
    console.log(
      `[1] Starting search for similar technologies to: "${inputSkill}"`,
    );

    // Step 1: Generate embedding for the input skill
    const embedding = await getEmbedding(inputSkill);
    console.log(`[2] Embedding generated for: "${inputSkill}"`);

    // Step 2: Perform similarity search directly in PostgreSQL
    const similarity = sql<number>`1 - (${cosineDistance(skills.vector, embedding)})`;
    const similarSkills = await db
      .select({
        technology: skills.skill,
        similarity,
      })
      .from(skills)
      .orderBy(cosineDistance(skills.vector, embedding))
      .limit(limit);
    console.log(
      `[3] Retrieved ${similarSkills.length} similar technologies after similarity search.`,
    );

    // Step 3: Shape the rows, skipping any without a usable similarity value.
    const result = similarSkills
      .map((s) => ({
        technology: s.technology,
        score: toScore(s.similarity),
      }))
      .filter((entry) => !Number.isNaN(entry.score));

    const skipped = similarSkills.length - result.length;
    if (skipped > 0) {
      console.warn(
        `Skipped ${skipped} row(s) without a numeric similarity value.`,
      );
    }

    console.log(`[5] Returning ${result.length} similar technologies.`);
    return result;
  } catch (error) {
    // Best-effort contract: callers receive an empty list instead of a rejection.
    console.error("Error querying similar technologies:", error);
    return [];
  }
};
1 Reply
This should be returning 10,000 rows. The table has 1,200,000 entries. However, the result is cut off after 390 rows for some reason. This is obviously a huge problem that I need fixed. For context, I am using Neon (neondb), and I'm not sure whether this is something to take up with them or whether it is a Drizzle issue.
Any help would be very much appreciated!