Commit b5b3e33

added more comments to code, cleaned up code, removed unused components
1 parent 9d9c6dc commit b5b3e33

File tree

3 files changed: 83 additions & 48 deletions

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+struggles with questions with the structure:
+```
+main content
+parts
+parts
+parts
+2nd main content
+```
+interprets 2nd main content as another part
+
+================================================
+
+does not know how to split sub-questions within a sub-question
+```
+main content
+part a
+part i
+part ii
+...
+part b
+...
+...
+```
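For reference, a minimal sketch of how a correctly split question of the second shape could be represented. The `subparts` field is a hypothetical illustration only; the notebook's actual Pydantic models use a flat `parts` list.

```python
# Hypothetical target structure for the nested example above.
# The notebook currently uses a flat `parts` list, so `subparts` is only
# an illustration of what splitting sub-questions within a sub-question would need.
nested_question = {
    "content": "main content",
    "parts": [
        {"part": "part a", "subparts": ["part i", "part ii"]},
        {"part": "part b", "subparts": ["..."]},
    ],
}
```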

conversion2025/mathpix_to_llm_to_in2lambda_to_JSON.ipynb

Lines changed: 58 additions & 47 deletions
@@ -83,6 +83,7 @@
 " print(\"PDF ID:\", pdf_id)\n",
 " print(\"Response:\", r.json())\n",
 "\n",
+" # URL where the processed PDF will be available\n",
 " url = f\"https://api.mathpix.com/v3/pdf/{pdf_id}.md\"\n",
 " headers = {\n",
 " \"app_id\": MATHPIX_APP_ID,\n",
@@ -121,12 +122,15 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"# location of the output folder and media folder.\n",
 "folder_path = \"conversion_content\"\n",
 "output_path = f\"{folder_path}/mathpix_to_llm_to_in2lambda_to_JSON_out\"\n",
 "media_path = f\"{output_path}/media\"\n",
 "\n",
+"# Create output and media directories if they do not exist.\n",
 "Path(media_path).mkdir(parents=True, exist_ok=True)\n",
 "\n",
+"# location of the source pdf file and the result markdown file.\n",
 "source_path = f\"{folder_path}/example.pdf\"\n",
 "result_path = f\"{output_path}/example.md\"\n",
 "\n",
@@ -139,6 +143,7 @@
 " print(f\"Error: Source PDF file not found at {source_path}\")\n",
 " exit(1)\n",
 "\n",
+"# Read the markdown content from the result file.\n",
 "try:\n",
 " with open(result_path, \"r\") as f:\n",
 " md_content = f.read()\n",
@@ -176,31 +181,22 @@
 " \"\"\"\n",
 " figures = {}\n",
 " # Regex to match figure references and their descriptions\n",
+" # Matches ![alt text](url) format for images\n",
 " pattern = r'!\\[.*?\\]\\((.*?)\\)'\n",
 " matches = re.findall(pattern, text)\n",
 " print(f\"Matches found: {matches}\")\n",
 " \n",
 " for match in matches:\n",
 " url = match\n",
 " url = url.strip()\n",
-" figure_caption_pattern = rf'\\({re.escape(url)}\\)\\s*-?\\s*Figure\\s+(Q\\d+)\\s*-\\s*(.+?)\\n'\n",
-" caption_match = re.search(figure_caption_pattern, text)\n",
-"\n",
-" if caption_match:\n",
-" title, description = caption_match.groups()\n",
-" print(\"Caption match found\")\n",
-" else:\n",
-" title, description = \"\", \"\"\n",
-"\n",
+" \n",
 " if url.startswith(\"http\"):\n",
 " # Download the image and save it to a file\n",
 " image = Image.open(requests.get(url, stream=True).raw)\n",
 " # Create a figure name based on the URL\n",
 " fig_name = os.path.basename(url)\n",
 " figures[fig_name] = {\n",
 " \"image\": image,\n",
-" \"title\": title.strip(),\n",
-" \"label\": description.strip(),\n",
 " \"url\": url,\n",
 " \"local_path\": \"\",\n",
 " # \"answerFile\": ans\n",
@@ -228,8 +224,11 @@
 "source": [
 "def save_figures_to_path(figures):\n",
 " for idx, (fig_name, fig_info) in enumerate(figures.items()):\n",
-" print(f\"FIGURE Title='{fig_info['title']}', Label='{fig_info['label']}', URL='{fig_info['url']}'\")\n",
+" print(f\"URL='{fig_info['url']}'\")\n",
+"\n",
 " # Extract file extension and create a clean filename\n",
+" # Mathpix leaves image urls like `image.png?width=800&height=600`\n",
+" # We only want the base name without query parameters.\n",
 " if \"?\" in fig_name:\n",
 " end_location = fig_name.index(\"?\")\n",
 " image_name = f\"{idx}_{fig_name[:end_location]}\"\n",
@@ -238,6 +237,7 @@
 " \n",
 " fig_info[\"local_path\"] = image_name\n",
 " try:\n",
+" # Saves the image to the media path\n",
 " fig_info[\"image\"].save(f\"{media_path}/{fig_info['local_path']}\")\n",
 " print(f\"Saved image: {fig_info['local_path']}\")\n",
 " except Exception as e:\n",
@@ -307,7 +307,7 @@
 " api_key=os.environ[\"OPENAI_API_KEY\"],\n",
 " )\n",
 "\n",
-"# Uses gpt-4o-mini:\n",
+"# Uses gpt-4.1-mini:\n",
 "# - more intelligent\n",
 "llm_mini = ChatOpenAI(\n",
 " model=\"gpt-4.1-mini\",\n",
@@ -349,7 +349,7 @@
 "\"\"\"\n",
 "\n",
 "def correct_mistakes_in_markdown(md_content: str) -> str:\n",
-" prompt = f\"\"\"\n",
+" correct_mistakes_prompt = f\"\"\"\n",
 " {llm_task_correct_mistakes}\n",
 "\n",
 " ```input\n",
@@ -359,7 +359,7 @@
 " Return the markdown now.\n",
 " \"\"\"\n",
 "\n",
-" response = llm_nano.invoke(prompt)\n",
+" response = llm_nano.invoke(correct_mistakes_prompt)\n",
 " print(\"Corrected markdown content:\")\n",
 " print(response.content.strip())\n",
 "\n",
@@ -401,7 +401,7 @@
 " - Identify the `year` if mentioned; otherwise, use \"0\".\n",
 " - For each question, carefully extract the full question text into `question_content` and the corresponding full solution/answer text into `solution_content`. They may not be in the same section.\n",
 " - If no solution is found, leave `solution_content` as an empty string `\"\"`.\n",
-" - Preserve all image tags like `![pictureTag](filename.jpg)`, making sure they are placed with their respective \"question_content\" and \"solution_content\".\n",
+" - Preserve all image tags like `![pictureTag](filename.jpg)`, making sure they are placed with their respective \"question_content\" and \"solution_content\". Do not duplicate them.\n",
 " - For Each Question extract all image references (e.g., `filename.jpg`) found within the `question_content` and `solution_content` and place them in the `images` list.\n",
 "\n",
 " 2. **Output Format (Crucial):**\n",
@@ -420,7 +420,8 @@
 " # Initialise the parser for the output.\n",
 " parser = PydanticOutputParser(pydantic_object=AllQuestionsModel)\n",
 "\n",
-" prompt = f\"\"\"\n",
+" # Prompt for the LLM to extract questions.\n",
+" seperate_questions_prompt = f\"\"\"\n",
 " Your task is to extract a JSON with the following structure exactly, ready to be parsed by a pydantic model:\n",
 " {parser.get_format_instructions()}\n",
 "\n",
@@ -437,7 +438,7 @@
 " for attempt_idx in range(3):\n",
 " \n",
 " # Call the LLM\n",
-" response = llm_mini.invoke(prompt)\n",
+" response = llm_mini.invoke(seperate_questions_prompt)\n",
 "\n",
 " # Debug: print the raw LLM response\n",
 " # print(\"Raw LLM Response:\")\n",
@@ -551,7 +552,7 @@
 " - Identify all sub-questions (e.g., \"(a)\", \"(b)\", \"i.\", \"ii.\") and place their text into the `parts` list. Sub-questions may also be implied.\n",
 " - Questions with no sub-questions should have a single part in the `parts` list, which is the entire question text.\n",
 " - Ensure that images references are correctly placed with their respective parts.\n",
-" - Preserve all content perfectly, including text, LaTeX, and image tags like `![pictureTag](filename.jpg)`.\n",
+" - Preserve all content perfectly, including text, LaTeX, and image tags like `![pictureTag](filename.jpg)`. Do not duplicate images.\n",
 " - Ensure no solution content is included in the `content` or `parts` fields.\n",
 " - You may choose what the title of the question should be.\n",
 " - The `images` list should be copied exactly from the input.\n",
@@ -572,10 +573,10 @@
 " Please follow these rules carefully:\n",
 "\n",
 " 1. **Content Extraction:**\n",
-" - From the `full solution`, find the worked solution that corresponds to the given `question part`.\n",
-" - Make sure the solutions for all parts together include the entire full solution text, with no missing content.\n",
+" - From the `full solution`, find the worked solution that corresponds to the given `target question part`.\n",
+" - Use the full question content and full question parts to help identify the correct parts of the solution to be extracted.\n",
 " - Place this exact text into the `part_solution` field.\n",
-" - Ensure that images references are correctly placed with their respective parts.\n",
+" - Ensure that images references are correctly placed with their respective parts. Do not duplicate images.\n",
 " - Preserve all content perfectly, including text, LaTeX, and image tags like `![pictureTag](filename.jpg)`.\n",
 " - If no specific solution is found, use an empty string `\"\"`.\n",
 "\n",
@@ -599,7 +600,9 @@
 " \n",
 " # Process the question part\n",
 " for attempt_idx in range(3):\n",
-" prompt = f\"\"\"\n",
+" # Prompt for the LLM to extract the question parts.\n",
+" # Use the full question content and the images to extract the parts.\n",
+" seperate_parts_question_prompt = f\"\"\"\n",
 " Your task is to extract a JSON with the following structure exactly, ready to be parsed by a pydantic model:\n",
 " {question_parser.get_format_instructions()}\n",
 "\n",
@@ -612,9 +615,9 @@
 "\n",
 " Return the JSON now.\n",
 " \"\"\"\n",
-" \n",
-" response = llm_mini.invoke(prompt)\n",
-" \n",
+"\n",
+" response = llm_mini.invoke(seperate_parts_question_prompt)\n",
+"\n",
 " try:\n",
 " parsed_output_parts = question_parser.parse(response.content)\n",
 " print(f\"LLM response successfully parsed question {question_idx + 1}.\")\n",
@@ -634,7 +637,9 @@
 " solution_parser = PydanticOutputParser(pydantic_object=Set_Solution_Part)\n",
 " \n",
 " for attempt_idx in range(3):\n",
-" prompt = f\"\"\"\n",
+" # Prompt for the LLM to extract the solution part.\n",
+" # Use the full solution content and the part to extract the specific solution.\n",
+" seperate_parts_solution_prompt = f\"\"\"\n",
 " Your task is to extract a JSON with the following structure exactly, ready to be parsed by a pydantic model:\n",
 " {solution_parser.get_format_instructions()}\n",
 "\n",
@@ -643,11 +648,17 @@
 " full solution:\n",
 " {question[\"solution_content\"]}\n",
 "\n",
-" question part:\n",
+" full question content:\n",
+" {parsed_output_parts.content}\n",
+"\n",
+" full question parts:\n",
+" {parsed_output_parts.parts}\n",
+"\n",
+" target question part:\n",
 " {part}\n",
 " \"\"\"\n",
 " \n",
-" response = llm_mini.invoke(prompt)\n",
+" response = llm_mini.invoke(seperate_parts_solution_prompt)\n",
 " \n",
 " try:\n",
 " cleaned_response = escape_latex_backslashes(response.content.strip())\n",
@@ -727,6 +738,8 @@
 " - Be careful to not wrap text that is already correctly formatted with LaTeX math delimiters.\n",
 " 3. **Display Math Formatting:** This rule is critical. Display math blocks MUST be formatted strictly as follows: a blank line, the opening `$$` on its own line, the LaTeX content, the closing `$$` on its own line, and a blank line.\n",
 " - **Incorrect:** `...text $$x=y$$ more text...`\n",
+" - **Incorrect:** `...text$$x=y$$\\nmore text...`\n",
+" - **Incorrect:** `...text\\n$$x=y$$more text...`\n",
 " - **Incorrect:** `...text\\n$$\\nx=y\\n\\n$$\\nmore text...`\n",
 " - **Correct:** `...text\\n\\n$$\\nx=y\\n$$\\n\\nmore text...`\n",
 " 4. **LaTeX Environments:** Environments like `aligned`, `cases`, `matrix`, `gathered`, etc., must be entirely contained within a single display math block (`$$...$$`). Ensure that every `\\begin{...}` has a matching `\\end{...}`.\n",
@@ -786,7 +799,7 @@
 " part_solution_validation_data = {\n",
 " \"part_solution\": part_solution\n",
 " }\n",
-" \n",
+"\n",
 " validation_prompt = f\"\"\"\n",
 " Your task is to extract a JSON with the following structure exactly, to be parsed by a pydantic model:\n",
 " {part_solution_parser.get_format_instructions()}\n",
@@ -937,10 +950,10 @@
 " dict: A dictionary containing the keys \"name\" and \"exercise\".\n",
 " If parsing fails, returns None.\n",
 " \"\"\"\n",
-" corrected_md_content = correct_mistakes_in_markdown(md_content)\n",
-" print(\"Markdown content corrected for spelling, grammar, and structure.\")\n",
+" # corrected_md_content = correct_mistakes_in_markdown(md_content)\n",
+" # print(\"Markdown content corrected for spelling, grammar, and structure.\")\n",
 "\n",
-" questions_dict = extract_questions(corrected_md_content)\n",
+" questions_dict = extract_questions(md_content)\n",
 " print(\"successfully extracted the questions from the markdown. Now extracting the parts...\")\n",
 "\n",
 " extracted_dict = extract_parts_question(questions_dict)\n",
@@ -965,7 +978,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"imported_tutorial = md_to_json(md_content)"
+"full_json_question_set = md_to_json(md_content)"
 ]
 },
 {
@@ -984,24 +997,22 @@
 "outputs": [],
 "source": [
 "# Extract title\n",
-"title = imported_tutorial[\"name\"] + \" \" + imported_tutorial[\"year\"]\n",
+"title = full_json_question_set[\"name\"] + \" \" + full_json_question_set[\"year\"]\n",
 "\n",
 "# Print the title\n",
 "print(f\"Title: {title}\\n\")\n",
 "\n",
 "# Extract questions\n",
-"questions = imported_tutorial[\"questions\"]\n",
-"\n",
-"print(questions)\n",
+"questions = full_json_question_set[\"questions\"]\n",
 "\n",
 "# Loop over and print each question\n",
-"for idx1, question in enumerate(questions, start=1):\n",
-" print(f\"**Question {idx1}**:\\n{question.get('title')}\\n\")\n",
+"for question_idx, question in enumerate(questions, start=1):\n",
+" print(f\"**Question {question_idx}**:\\n{question.get('title')}\\n\")\n",
 " print(f\"Content: {question.get('content')}\\n\")\n",
-" for idx2, (part, part_answer) in enumerate(zip(question.get(\"parts\", []), question.get(\"parts_solutions\", [])), start=1):\n",
-" print(f\"Question {idx1}:\")\n",
-" print(f\"- Subquestion {idx2}: {part}\")\n",
-" print(f\"- Worked Solution {idx2}: {part_answer}\")\n",
+" for part_idx, (part_question, part_answer) in enumerate(zip(question.get(\"parts\", []), question.get(\"parts_solutions\", [])), start=1):\n",
+" print(f\"Question {question_idx}:\")\n",
+" print(f\"- Subquestion {part_idx}: {part_question}\")\n",
+" print(f\"- Worked Solution {part_idx}: {part_answer}\")\n",
 " print(\"\\n\")\n",
 " print(\"-\" * 40) # Separator for readability"
 ]
@@ -1021,12 +1032,12 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"questions = imported_tutorial[\"questions\"]\n",
+"questions = full_json_question_set[\"questions\"]\n",
 "\n",
 "in2lambda_questions = []\n",
 "\n",
 "# Loop over all questions and question_answers and use in2lambda API to create a JSON.\n",
-"for idx, question_dict in enumerate(questions, start=1):\n",
+"for question_idx, question_dict in enumerate(questions, start=1):\n",
 " parts = []\n",
 " for part_question, part_solution in zip(question_dict.get(\"parts\", []), question_dict.get(\"parts_solutions\", [])):\n",
 " part_obj = Part(\n",
@@ -1048,7 +1059,7 @@
 " print(f\"Warning: Image file not found: {full_path}\")\n",
 "\n",
 " question = Question(\n",
-" title=question_dict.get(\"title\", f\"Question {idx}\"),\n",
+" title=question_dict.get(\"title\", f\"Question {question_idx}\"),\n",
 " main_text=question_dict.get(\"content\", \"\"),\n",
 " parts=parts,\n",
 " images=image_paths\n",

conversion2025/todo.txt

Lines changed: 2 additions & 1 deletion
@@ -1,4 +1,5 @@
 TODO:
 
 ensure no answer in question itself, only in parts_solutions.
-properly handles images allocation
+properly handles images allocation
+give the LLM access to the full question to better decide how to split the solution into parts
