/**
 * Phrases that signal a specific breadth requirement in a user query.
 *
 * Each key is a breadth-requirement code; its value lists the spellings that
 * should be treated as a mention of that requirement (the code itself plus
 * common free-text variants).
 *
 * Note: several entries contain uppercase letters (the codes themselves and
 * the official "History, Philosophy and Cultural Studies" title), so consumers
 * must compare case-insensitively for those entries to be reachable.
 */
export const BREADTH_REQUIREMENT_KEYWORDS = {
  ART_LIT_LANG: [
    "ART_LIT_LANG",
    "art literature",
    "arts literature",
    "art language",
    "arts language",
    "literature language",
    "art literature language",
    "arts literature language",
  ],
  HIS_PHIL_CUL: [
    "HIS_PHIL_CUL",
    "history philosophy culture",
    "history, philosophy, culture",
    "history, philosophy, and culture",
    "history, philosophy",
    "history philosophy",
    "philosophy culture",
    "philosophy, culture",
    "history culture",
    "History, Philosophy and Cultural Studies",
  ],
  SOCIAL_SCI: [
    "SOCIAL_SCI",
    "social science",
    "social sciences",
  ],
  NAT_SCI: [
    "NAT_SCI",
    "natural science",
    "natural sciences",
  ],
  QUANT: [
    "QUANT",
    "quantitative reasoning",
  ],
};

/**
 * Phrases that signal a course year level in a user query.
 *
 * Each key names a year level; its value lists the accepted spellings:
 * spelled-out, hyphenated, letter-level ("A-level" … "D-level") and ordinal
 * ("1st year" … "4th year") forms. The letter-level entries are capitalised,
 * so consumers must compare case-insensitively.
 */
export const YEAR_LEVEL_KEYWORDS = {
  first_year: [
    "first year",
    "first-year",
    "A-level",
    "A level",
    "1st year",
  ],
  second_year: [
    "second year",
    "second-year",
    "B-level",
    "B level",
    "2nd year",
  ],
  third_year: [
    "third year",
    "third-year",
    "C-level",
    "C level",
    "3rd year",
  ],
  fourth_year: [
    "fourth year",
    "fourth-year",
    "D-level",
    "D level",
    "4th year",
  ],
};
relevantNamespaces.push("departments"); - if (!relevantNamespaces.includes("courses_v2")) - relevantNamespaces.push("courses_v2"); + if (!relevantNamespaces.includes("courses_v3")) + relevantNamespaces.push("courses_v3"); } // If search is required at all @@ -83,7 +87,7 @@ function analyzeQuery(query: string): { // If no specific namespaces identified & search required, then search all if (requiresSearch && relevantNamespaces.length === 0) { relevantNamespaces.push( - "courses_v2", + "courses_v3", "offerings", "prerequisites", "corequisites", @@ -106,6 +110,7 @@ async function searchSelectedNamespaces( query: string, k: number, namespaces: string[], + filters?: Object, ): Promise { let allResults: Document[] = []; @@ -127,6 +132,7 @@ async function searchSelectedNamespaces( const results = await namespaceStore.similaritySearch( query, Math.max(k, namespaceToMinResults.get(namespace)), + namespace === "courses_v3" ? filters : undefined, ); console.log(`Found ${results.length} results in namespace: ${namespace}`); allResults = [...allResults, ...results]; @@ -172,16 +178,18 @@ async function reformulateQuery( - DO replace pronouns and references with specific names and identifiers - DO include course codes, names and specific details for academic entities - If the query is not about university courses & offerings, return exactly a copy of the user's query. + - Append "code: " before course codes For example: "CSCC01" -> "code: CSCC01" + - If a course year level is written as "first year", "second year", etc. Then replace "first" with "1st" and "second" with "2nd" etc. Examples: User: "When is it offered?" - Output: "When is CSCA48 Introduction to Computer Science offered in the 2024-2025 academic year?" + Output: "When is CSCA48 offered in the 2024-2025 academic year?" User: "Tell me more about that" - Output: "What are the details, descriptions, and requirements for MATA31 Calculus I?" + Output: "What are the details, descriptions, and requirements for MATA31?" 
/**
 * Derives the metadata filter implied by a user query.
 *
 * The query is scanned for breadth-requirement and year-level keywords. Every
 * matched key is converted to its stored metadata value and turned into an
 * `$eq` clause on `breadth_requirement` or `year_level`.
 *
 * Keywords are matched case-insensitively and only as whole phrases. The
 * keyword tables contain mixed-case entries ("NAT_SCI", "D-level", ...), which
 * could never match when compared verbatim against the lowercased query; and a
 * plain lowercase substring test would misfire on short entries (e.g. "quant"
 * inside "quantum", "d level" inside "advanced level").
 *
 * @param query - The user query to inspect.
 * @returns `{}` when no keyword matched; `{ $or: [...] }` when only one kind
 *   of keyword matched; `{ $and: [{ $or }, { $or }] }` (breadth first, then
 *   year level) when both kinds matched.
 */
function includeFilters(query: string): Record<string, unknown> {
  const lowerQuery = query.toLocaleLowerCase();

  // True when `keyword` occurs in the query as a whole phrase, ignoring case.
  const mentions = (keyword: string): boolean => {
    const escaped = keyword
      .toLocaleLowerCase()
      .replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    // The lookarounds reject matches glued to a letter, digit or underscore.
    return new RegExp(`(?<![a-z0-9_])${escaped}(?![a-z0-9_])`).test(lowerQuery);
  };

  // Converts every table key that has at least one mentioned keyword.
  const matchedValues = (
    table: Record<string, readonly string[]>,
    convert: (key: string) => string,
  ): string[] =>
    Object.entries(table)
      .filter(([, keywords]) => keywords.some((keyword) => mentions(keyword)))
      .map(([key]) => convert(key))
      // The converters return "" for unknown keys; never filter on that.
      .filter((value) => value !== "");

  const relevantBreadthRequirements = matchedValues(
    BREADTH_REQUIREMENT_KEYWORDS,
    convertBreadthRequirement,
  );
  const relevantYearLevels = matchedValues(
    YEAR_LEVEL_KEYWORDS,
    convertYearLevel,
  );

  // One `$or` group per metadata field that had at least one match.
  const clauses = [
    { field: "breadth_requirement", values: relevantBreadthRequirements },
    { field: "year_level", values: relevantYearLevels },
  ]
    .filter(({ values }) => values.length > 0)
    .map(({ field, values }) => ({
      $or: values.map((value) => ({ [field]: { $eq: value } })),
    }));

  // Two groups are combined with `$and`; a single group is returned as-is;
  // no groups yields the empty object callers test for.
  return clauses.length > 1 ? { $and: clauses } : clauses[0] ?? {};
}
/**
 * Translates a breadth-requirement code into its full display name.
 *
 * @param code - Breadth-requirement code, e.g. "NAT_SCI". Matching is exact
 *   and case-sensitive.
 * @returns The full name for a known code, or an empty string for any other
 *   input.
 */
export const convertBreadthRequirement = (code: string) => {
  switch (code) {
    case "ART_LIT_LANG":
      return "Arts, Literature and Language";
    case "HIS_PHIL_CUL":
      return "History, Philosophy and Cultural Studies";
    case "SOCIAL_SCI":
      return "Social and Behavioral Sciences";
    case "NAT_SCI":
      return "Natural Sciences";
    case "QUANT":
      return "Quantitative Reasoning";
    default:
      // Unknown codes map to the empty string rather than throwing.
      return "";
  }
};
/**
 * Translates a year-level key into the label stored in course metadata.
 *
 * @param code - Year-level key, e.g. "first_year". Matching is exact and
 *   case-sensitive.
 * @returns "1st year" … "4th year" for a known key, or an empty string for any
 *   other input.
 */
export const convertYearLevel = (code: string) => {
  switch (code) {
    case "first_year":
      return "1st year";
    case "second_year":
      return "2nd year";
    case "third_year":
      return "3rd year";
    case "fourth_year":
      return "4th year";
    default:
      return "";
  }
};

/**
 * Reads the value part of one "name: value" line of a loaded CSV row.
 *
 * @param pageContent - Row text, one "name: value" entry per line.
 * @param lineIndex - Zero-based index of the line to read.
 * @returns Everything after the first ": " on that line, or "" when the line
 *   does not exist or has no ": " separator.
 */
function readCsvField(pageContent: string, lineIndex: number): string {
  const line = pageContent.split("\n")[lineIndex];
  if (line === undefined) return "";
  const separatorAt = line.indexOf(": ");
  // Slice after the FIRST separator so values that contain ": " stay intact.
  return separatorAt === -1 ? "" : line.slice(separatorAt + 2);
}

/**
 * Generates embeddings for courses.csv, one document per row, and stores them
 * in the given namespace with `breadth_requirement` and `year_level` metadata
 * alongside `source` and `row`.
 *
 * @param filePath - Path of the courses CSV file to load.
 * @param namespace - Namespace the documents are stored under.
 * @throws {Error} If the PINECONE_INDEX_NAME environment variable is not set.
 */
async function processCoursesCSV(filePath: string, namespace: string) {
  // Fail before any loading/embedding work if the target index is unknown.
  const indexName = process.env.PINECONE_INDEX_NAME;
  if (!indexName) {
    throw new Error(
      "PINECONE_INDEX_NAME is not set; cannot store course embeddings.",
    );
  }

  // Fields are located by line position within each row's text.
  // NOTE(review): assumes CSVLoader emits one "column: value" line per column,
  // with the breadth requirement in the 2nd column and the year level in the
  // 11th, and that no earlier value contains a line break — confirm against
  // the CSV header and the loader's output format.
  const BREADTH_REQUIREMENT_LINE = 1;
  const YEAR_LEVEL_LINE = 10;

  const fileName = path.basename(filePath);
  const loader = new CSVLoader(filePath);
  const rows = await loader.load();

  const docs = rows.map((doc, rowIndex) => ({
    ...doc,
    metadata: {
      ...doc.metadata,
      source: fileName,
      row: rowIndex + 1,
      // Short or malformed rows yield "" instead of throwing a TypeError.
      breadth_requirement: convertBreadthRequirement(
        readCsvField(doc.pageContent, BREADTH_REQUIREMENT_LINE),
      ),
      year_level: readCsvField(doc.pageContent, YEAR_LEVEL_LINE),
    },
  }));
  console.log("Sample doc: ", docs[0]);

  const pineconeIndex = pinecone.Index(indexName);

  // Store each row as an individual embedding
  await PineconeStore.fromDocuments(docs, embeddings, {
    // NOTE(review): cast kept from the original; presumably works around a
    // type mismatch between the Pinecone client and the store — confirm.
    pineconeIndex: pineconeIndex as any,
    namespace: namespace,
  });
}
processPDF(filePath: string, namespace: string) { // processCSV("../data/tables/offerings_winter_2026.csv", "offerings") // processCSV("../data/tables/departments.csv", "departments") // processCSV("../data/tables/courses_with_year.csv", "courses_v2") +// processCoursesCSV("../data/tables/courses_with_year.csv", "courses_v3"); console.log("embeddings done.");