Introduced two OpenAI models (gpt-4.1-nano and gpt-4o-mini) to balance speed and accuracy

HarrySu123 · HarrySu123 · commit 302af9603d35 · 2025-07-14T14:33:23.000+01:00
diff --git a/conversion2025/mathpix_to_llm_to_in2lambda_to_JSON.ipynb b/conversion2025/mathpix_to_llm_to_in2lambda_to_JSON.ipynb
@@ -294,8 +294,20 @@
    "outputs": [],
    "source": [
     "# Set up the LLM via LangChain.\n",
-    "llm = ChatOpenAI(\n",
-    "            model=os.environ['OPENAI_MODEL'],\n",
+    "\n",
+    "# Uses gpt-4.1-nano:\n",
+    "#    - a faster model\n",
+    "#    - less intelligent\n",
+    "\n",
+    "llm_nano = ChatOpenAI(\n",
+    "            model=\"gpt-4.1-nano\",\n",
+    "            api_key=os.environ[\"OPENAI_API_KEY\"],\n",
+    "        )\n",
+    "\n",
+    "# Uses gpt-4o-mini:\n",
+    "#    - more intelligent\n",
+    "llm_mini = ChatOpenAI(\n",
+    "            model=\"gpt-4o-mini\",\n",
     "            api_key=os.environ[\"OPENAI_API_KEY\"],\n",
     "        )"
    ]
@@ -338,7 +350,7 @@
     "        Return the markdown now.\n",
     "    \"\"\"\n",
     "\n",
-    "    response = llm.invoke(prompt)\n",
+    "    response = llm_nano.invoke(prompt)\n",
     "\n",
     "    return response.content.strip()"
    ]
@@ -387,12 +399,13 @@
     "        5. Output only a valid, plain, raw JSON string matching the schema above, ready to parse immediately\n",
     "            - NO markdown code blocks, NO extra text, NO explanations.\n",
     "            - Use plain newlines (not escaped as `\\n`).\n",
+    "            - In JSON strings, backslashes must be escaped. Use \\\\\\\\ for LaTeX backslashes.\n",
     "            - Always have each field in the JSON, even if it is empty.\n",
     "            - Becareful that the last element of a list is not followed by a comma.\n",
-    "        6. The Text inside the JSON should be in Texdown:\n",
+    "        6. The Text inside the JSON should be in Lexdown:\n",
     "            1. preserving all LaTeX math delimiters (`$...$` and `$$...$$`) and all formatting exactly as in the input, without paraphrasing, summarizing, or simplifying any mathematical expressions or formulas.\n",
-    "            2. do not remove or collapse blank lines.\n",
-    "            3. Do not escape characters like `\\n` or `\\\\`.\n",
+    "            2. Do not remove or collapse blank lines.\n",
+    "            3. Do not escape characters like `\\n` or `\\\\` except for JSON requirements.\n",
     "    \"\"\"\n",
     "\n",
     "def extract_questions(doc_page_content: str) -> dict:\n",
@@ -416,7 +429,7 @@
     "    for attempt_idx in range(3):\n",
     "        \n",
     "        # Call the LLM\n",
-    "        response = llm.invoke(prompt)\n",
+    "        response = llm_mini.invoke(prompt)\n",
     "\n",
     "        # Debug: print the raw LLM response\n",
     "        # print(\"Raw LLM Response:\")\n",
@@ -431,9 +444,7 @@
     "            return parsed_output.model_dump()\n",
     "        except Exception as e:\n",
     "            print(\"Error parsing LLM response as JSON:\")\n",
-    "            print(\"Outputted response:\\n\", response.content)\n",
     "            print(\"Retrying... Attempt No.\", attempt_idx + 1)\n",
-    "            raise e\n",
     "            time.sleep(2)\n",
     "\n",
     "    print(\"Final raw LLM Response:\")\n",
@@ -509,11 +520,16 @@
     "            - The parts may be obvious to find, like \"a)...\", \"b)...\", or, \"i)...\", \"ii)...\", etc, or they could be implied by the question itself. All question must have at least one part, if there is only one part.\n",
     "                1. The stem should be placed into the \"content\" field. Text in this field should be valid in the Milkdown editor. \n",
     "                2. the parts of the question (subquestions) should be placed into the \"parts\" field. Text in this field should be valid under Lexdown.\n",
-    "        5. Output only a valid, plain, raw JSON string matching the schema above, ready to parse immediately, NO markdown code blocks, NO extra text, NO explanations. Use plain newlines (not escaped as `\\n`).\n",
+    "        5. Output only a valid, plain, raw JSON string matching the schema above, ready to parse immediately\n",
+    "            - NO markdown code blocks, NO extra text, NO explanations.\n",
+    "            - Use plain newlines (not escaped as `\\n`).\n",
+    "            - In JSON strings, backslashes must be escaped. Use \\\\\\\\ for LaTeX backslashes.\n",
+    "            - Always have each field in the JSON, even if it is empty.\n",
+    "            - Becareful that the last element of a list is not followed by a comma.\n",
     "        6. The Text inside the JSON should be in Lexdown:\n",
     "            1. preserving all LaTeX math delimiters (`$...$` and `$$...$$`) and all formatting exactly as in the input, without paraphrasing, summarizing, or simplifying any mathematical expressions or formulas.\n",
     "            2. Do not remove or collapse blank lines.\n",
-    "            3. Do not escape characters like `\\n` or `\\\\`.\n",
+    "            3. Do not escape characters like `\\n` or `\\\\` except for JSON requirements.\n",
     "    \"\"\"\n",
     "\n",
     "llm_task_seperate_parts_solution = \"\"\"\n",
@@ -523,11 +539,17 @@
     "        2. Use the same list of images as in the input for each question.\n",
     "        3. For each parts of the question (subquestions):\n",
     "            - Carefully try to find the solution for each part, and place it into the \"part_solution\" field. Otherwise, leave as empty string. Text in this field should be valid under Lexdown.\n",
-    "        5. Output only a valid, plain, raw JSON string matching the schema above, ready to parse immediately, NO markdown code blocks, NO extra text, NO explanations. Use plain newlines (not escaped as `\\n`).\n",
-    "        6. The Text inside the JSON should be in Lexdown:\n",
+    "            - Make sure that the solution is only for the particular part.\n",
+    "        4. Output only a valid, plain, raw JSON string matching the schema above, ready to parse immediately\n",
+    "            - NO markdown code blocks, NO extra text, NO explanations.\n",
+    "            - Use plain newlines (not escaped as `\\n`).\n",
+    "            - In JSON strings, backslashes must be escaped. Use \\\\\\\\ for LaTeX backslashes.\n",
+    "            - Always have each field in the JSON, even if it is empty.\n",
+    "            - Becareful that the last element of a list is not followed by a comma.\n",
+    "        5. The Text inside the JSON should be in Lexdown:\n",
     "            1. preserving all LaTeX math delimiters (`$...$` and `$$...$$`) and all formatting exactly as in the input, without paraphrasing, summarizing, or simplifying any mathematical expressions or formulas.\n",
     "            2. Do not remove or collapse blank lines.\n",
-    "            3. Do not escape characters like `\\n` or `\\\\`.\n",
+    "            3. Do not escape characters like `\\n` or `\\\\` except for JSON requirements.\n",
     "    \"\"\"\n",
     "\n",
     "def extract_parts_question(questions_dict: dict) -> dict:\n",
@@ -584,7 +606,7 @@
     "                \"\"\"\n",
     "            \n",
     "            # Call the LLM\n",
-    "            response = llm.invoke(prompt)\n",
+    "            response = llm_mini.invoke(prompt)\n",
     "\n",
     "            # Debug: print the raw LLM response\n",
     "            # print(\"Raw LLM Response:\")\n",
@@ -594,6 +616,7 @@
     "                # Parse the response using the output parser.\n",
     "                parsed_output_parts = question_parser.parse(response.content)\n",
     "                print(f\"LLM response successfully parsed question {question_idx + 1}.\")\n",
+    "                print(parsed_output_parts.content)\n",
     "                # For Pydantic v2, use model_dump() to convert the model to a dictionary.\n",
     "                question_parse_success = True\n",
     "                break\n",
@@ -630,12 +653,13 @@
     "                    \"\"\"\n",
     "                \n",
     "                # Call the LLM\n",
-    "                response = llm.invoke(prompt)\n",
+    "                response = llm_mini.invoke(prompt)\n",
     "\n",
     "                try:\n",
     "                    # Parse the response using the output parser.\n",
     "                    parsed_output_solution_part = solution_parser.parse(response.content)\n",
     "                    print(f\"LLM response successfully parsed solution for part {part_idx + 1} of question {question_idx + 1}.\")\n",
+    "                    print(response.content)\n",
     "                    # For Pydantic v2, use model_dump() to convert the model to a dictionary.\n",
     "                    solution_parse_success = True\n",
     "                    break\n",
@@ -688,10 +712,9 @@
     "\n",
     "llm_task_expression_check = r\"\"\"\n",
     "    Look inside the structure, specifically the `part_text` and `part_solution` fields. Ensure that the JSON content follows these rules:\n",
-    "        1. No extra escaping: The JSON string must contain no literal `\\\\n`, `\\\\\\\\`, or unnecessary escape sequences unless they are explicitly present in the original input text.\n",
-    "        2. Careful to make the distinction between inline and display math, i.e. do not mess up the use of `$` and `$$`.\n",
-    "        3. Math delimiters: All LaTeX math commands and math macros must be fully enclosed within math delimiters — use `$...$` for inline math, and `$$...$$` for display math.\n",
-    "        4. Balanced delimiters:\n",
+    "        1. JSON escaping: In JSON strings, backslashes must be escaped. Use \\\\\\\\ for LaTeX backslashes (e.g., \"$A \\\\\\\\cup B$\" not \"$A \\\\cup B$\").\n",
+    "        2. Math delimiters: All LaTeX math commands and math macros must be fully enclosed within math delimiters — use `$...$` for inline math, and `$$...$$` for display math.\n",
+    "        3. Balanced delimiters:\n",
     "            - All `$$` and `$` must be properly opened and closed.\n",
     "            - No unbalanced or partial math blocks.\n",
     "        4. Display math formatting:\n",
@@ -702,17 +725,18 @@
     "            - `$...$` should not span multiple lines.\n",
     "            - Avoid using `$$` for short inline expressions.\n",
     "        6. Preserve LaTeX syntax:\n",
-    "            - All LaTeX commands, backslashes (`\\`), braces (`{}`, `[]`), and special characters must be preserved exactly as in the original input.\n",
-    "            - Do not add or remove escaping.\n",
+    "            - All LaTeX commands, braces (`{}`, `[]`), and special characters must be preserved exactly as in the original input.\n",
+    "            - Remember: in JSON, use \\\\\\\\ for each LaTeX backslash.\n",
     "        7. Blank lines:\n",
     "            - Preserve all blank lines inside math blocks.\n",
     "            - Outside math, follow the structure of the original input.\n",
     "        8. Alt text and image URLs:\n",
     "            - Ensure that all image URLs and alt text are preserved as they appear in the original input.\n",
     "            - The alt text must be `pictureTag`.\n",
-    "        8. Output format:\n",
+    "        9. Output format:\n",
     "            - Output a single valid JSON string.\n",
     "            - Do not include any extra characters, explanations, or escaped formatting outside the JSON structure.\n",
+    "            - No literal \\\\n sequences - use actual newlines in JSON strings.\n",
     "    \"\"\"\n",
     "\n",
     "\n",
@@ -762,7 +786,7 @@
     "                    \"\"\"\n",
     "\n",
     "                # Call the LLM\n",
-    "                response = llm.invoke(validation_prompt)\n",
+    "                response = llm_nano.invoke(validation_prompt)\n",
     "\n",
     "                try:\n",
     "                    # Parse the response using the output parser.\n",
@@ -826,10 +850,10 @@
     "        dict: A dictionary containing the keys \"name\" and \"exercise\".\n",
     "              If parsing fails, returns None.\n",
     "    \"\"\"\n",
-    "    # corrected_md_content = correct_mistakes_in_markdown(md_content)\n",
-    "    # print(\"Markdown content corrected for spelling, grammar, and structure.\")\n",
+    "    corrected_md_content = correct_mistakes_in_markdown(md_content)\n",
+    "    print(\"Markdown content corrected for spelling, grammar, and structure.\")\n",
     "\n",
-    "    questions_dict = extract_questions(md_content)\n",
+    "    questions_dict = extract_questions(corrected_md_content)\n",
     "    print(\"successfully extracted the questions from the markdown. Now extracting the parts...\")\n",
     "\n",
     "    extracted_dict = extract_parts_question(questions_dict)\n",