Commit b5b3e33

added more comments to code, cleaned up code, removed unused components
1 parent 9d9c6dc commit b5b3e33

File tree

3 files changed: 83 additions & 48 deletions

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+struggles with questions with the structure:
+```
+main content
+parts
+parts
+parts
+2nd main content
+```
+interprets 2nd main content as another part
+
+================================================
+
+does not know how to split sub-questions within a sub-question
+```
+main content
+part a
+part i
+part ii
+...
+part b
+...
+...
+```
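For reference, a minimal sketch of how a correctly split question of the second shape could be represented. The `subparts` field is a hypothetical illustration only; the notebook's actual Pydantic models use a flat `parts` list.

```python
# Hypothetical target structure for the nested example above.
# The notebook currently uses a flat `parts` list, so `subparts` is only
# an illustration of what splitting sub-questions within a sub-question would need.
nested_question = {
    "content": "main content",
    "parts": [
        {"part": "part a", "subparts": ["part i", "part ii"]},
        {"part": "part b", "subparts": ["..."]},
    ],
}
```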

conversion2025/mathpix_to_llm_to_in2lambda_to_JSON.ipynb

Lines changed: 58 additions & 47 deletions
@@ -83,6 +83,7 @@
 " print(\"PDF ID:\", pdf_id)\n",
 " print(\"Response:\", r.json())\n",
 "\n",
+" # URL where the processed PDF will be available\n",
 " url = f\"https://api.mathpix.com/v3/pdf/{pdf_id}.md\"\n",
 " headers = {\n",
 " \"app_id\": MATHPIX_APP_ID,\n",
@@ -121,12 +122,15 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"# location of the output folder and media folder.\n",
 "folder_path = \"conversion_content\"\n",
 "output_path = f\"{folder_path}/mathpix_to_llm_to_in2lambda_to_JSON_out\"\n",
 "media_path = f\"{output_path}/media\"\n",
 "\n",
+"# Create output and media directories if they do not exist.\n",
 "Path(media_path).mkdir(parents=True, exist_ok=True)\n",
 "\n",
+"# location of the source pdf file and the result markdown file.\n",
 "source_path = f\"{folder_path}/example.pdf\"\n",
 "result_path = f\"{output_path}/example.md\"\n",
 "\n",
@@ -139,6 +143,7 @@
 " print(f\"Error: Source PDF file not found at {source_path}\")\n",
 " exit(1)\n",
 "\n",
+"# Read the markdown content from the result file.\n",
 "try:\n",
 " with open(result_path, \"r\") as f:\n",
 " md_content = f.read()\n",
@@ -176,31 +181,22 @@
 " \"\"\"\n",
 " figures = {}\n",
 " # Regex to match figure references and their descriptions\n",
+" # Matches ![alt text](url) format for images\n",
 " pattern = r'!\\[.*?\\]\\((.*?)\\)'\n",
 " matches = re.findall(pattern, text)\n",
 " print(f\"Matches found: {matches}\")\n",
 " \n",
 " for match in matches:\n",
 " url = match\n",
 " url = url.strip()\n",
-" figure_caption_pattern = rf'\\({re.escape(url)}\\)\\s*-?\\s*Figure\\s+(Q\\d+)\\s*-\\s*(.+?)\\n'\n",
-" caption_match = re.search(figure_caption_pattern, text)\n",
-"\n",
-" if caption_match:\n",
-" title, description = caption_match.groups()\n",
-" print(\"Caption match found\")\n",
-" else:\n",
-" title, description = \"\", \"\"\n",
-"\n",
+" \n",
 " if url.startswith(\"http\"):\n",
 " # Download the image and save it to a file\n",
 " image = Image.open(requests.get(url, stream=True).raw)\n",
 " # Create a figure name based on the URL\n",
 " fig_name = os.path.basename(url)\n",
 " figures[fig_name] = {\n",
 " \"image\": image,\n",
-" \"title\": title.strip(),\n",
-" \"label\": description.strip(),\n",
 " \"url\": url,\n",
 " \"local_path\": \"\",\n",
 " # \"answerFile\": ans\n",
@@ -228,8 +224,11 @@
 "source": [
 "def save_figures_to_path(figures):\n",
 " for idx, (fig_name, fig_info) in enumerate(figures.items()):\n",
-" print(f\"FIGURE Title='{fig_info['title']}', Label='{fig_info['label']}', URL='{fig_info['url']}'\")\n",
+" print(f\"URL='{fig_info['url']}'\")\n",
+"\n",
 " # Extract file extension and create a clean filename\n",
+" # Mathpix leaves image urls like `image.png?width=800&height=600`\n",
+" # We only want the base name without query parameters.\n",
 " if \"?\" in fig_name:\n",
 " end_location = fig_name.index(\"?\")\n",
 " image_name = f\"{idx}_{fig_name[:end_location]}\"\n",
@@ -238,6 +237,7 @@
 " \n",
 " fig_info[\"local_path\"] = image_name\n",
 " try:\n",
+" # Saves the image to the media path\n",
 " fig_info[\"image\"].save(f\"{media_path}/{fig_info['local_path']}\")\n",
 " print(f\"Saved image: {fig_info['local_path']}\")\n",
 " except Exception as e:\n",
@@ -307,7 +307,7 @@
 " api_key=os.environ[\"OPENAI_API_KEY\"],\n",
 " )\n",
 "\n",
-"# Uses gpt-4o-mini:\n",
+"# Uses gpt-4.1-mini:\n",
 "# - more intelligent\n",
 "llm_mini = ChatOpenAI(\n",
 " model=\"gpt-4.1-mini\",\n",
@@ -349,7 +349,7 @@
 "\"\"\"\n",
 "\n",
 "def correct_mistakes_in_markdown(md_content: str) -> str:\n",
-" prompt = f\"\"\"\n",
+" correct_mistakes_prompt = f\"\"\"\n",
 " {llm_task_correct_mistakes}\n",
 "\n",
 " ```input\n",
@@ -359,7 +359,7 @@
 " Return the markdown now.\n",
 " \"\"\"\n",
 "\n",
-" response = llm_nano.invoke(prompt)\n",
+" response = llm_nano.invoke(correct_mistakes_prompt)\n",
 " print(\"Corrected markdown content:\")\n",
 " print(response.content.strip())\n",
 "\n",
@@ -401,7 +401,7 @@
 " - Identify the `year` if mentioned; otherwise, use \"0\".\n",
 " - For each question, carefully extract the full question text into `question_content` and the corresponding full solution/answer text into `solution_content`. They may not be in the same section.\n",
 " - If no solution is found, leave `solution_content` as an empty string `\"\"`.\n",
-" - Preserve all image tags like `![pictureTag](filename.jpg)`, making sure they are placed with their respective \"question_content\" and \"solution_content\".\n",
+" - Preserve all image tags like `![pictureTag](filename.jpg)`, making sure they are placed with their respective \"question_content\" and \"solution_content\". Do not duplicate them.\n",
 " - For Each Question extract all image references (e.g., `filename.jpg`) found within the `question_content` and `solution_content` and place them in the `images` list.\n",
 "\n",
 " 2. **Output Format (Crucial):**\n",
@@ -420,7 +420,8 @@
 " # Initialise the parser for the output.\n",
 " parser = PydanticOutputParser(pydantic_object=AllQuestionsModel)\n",
 "\n",
-" prompt = f\"\"\"\n",
+" # Prompt for the LLM to extract questions.\n",
+" seperate_questions_prompt = f\"\"\"\n",
 " Your task is to extract a JSON with the following structure exactly, ready to be parsed by a pydantic model:\n",
 " {parser.get_format_instructions()}\n",
 "\n",
@@ -437,7 +438,7 @@
 " for attempt_idx in range(3):\n",
 " \n",
 " # Call the LLM\n",
-" response = llm_mini.invoke(prompt)\n",
+" response = llm_mini.invoke(seperate_questions_prompt)\n",
 "\n",
 " # Debug: print the raw LLM response\n",
 " # print(\"Raw LLM Response:\")\n",
@@ -551,7 +552,7 @@
 " - Identify all sub-questions (e.g., \"(a)\", \"(b)\", \"i.\", \"ii.\") and place their text into the `parts` list. Sub-questions may also be implied.\n",
 " - Questions with no sub-questions should have a single part in the `parts` list, which is the entire question text.\n",
 " - Ensure that images references are correctly placed with their respective parts.\n",
-" - Preserve all content perfectly, including text, LaTeX, and image tags like `![pictureTag](filename.jpg)`.\n",
+" - Preserve all content perfectly, including text, LaTeX, and image tags like `![pictureTag](filename.jpg)`. Do not duplicate images.\n",
 " - Ensure no solution content is included in the `content` or `parts` fields.\n",
 " - You may choose what the title of the question should be.\n",
 " - The `images` list should be copied exactly from the input.\n",
@@ -572,10 +573,10 @@
 " Please follow these rules carefully:\n",
 "\n",
 " 1. **Content Extraction:**\n",
-" - From the `full solution`, find the worked solution that corresponds to the given `question part`.\n",
-" - Make sure the solutions for all parts together include the entire full solution text, with no missing content.\n",
+" - From the `full solution`, find the worked solution that corresponds to the given `target question part`.\n",
+" - Use the full question content and full question parts to help identify the correct parts of the solution to be extracted.\n",
 " - Place this exact text into the `part_solution` field.\n",
-" - Ensure that images references are correctly placed with their respective parts.\n",
+" - Ensure that images references are correctly placed with their respective parts. Do not duplicate images.\n",
 " - Preserve all content perfectly, including text, LaTeX, and image tags like `![pictureTag](filename.jpg)`.\n",
 " - If no specific solution is found, use an empty string `\"\"`.\n",
 "\n",
@@ -599,7 +600,9 @@
 " \n",
 " # Process the question part\n",
 " for attempt_idx in range(3):\n",
-" prompt = f\"\"\"\n",
+" # Prompt for the LLM to extract the question parts.\n",
+" # Use the full question content and the images to extract the parts.\n",
+" seperate_parts_question_prompt = f\"\"\"\n",
 " Your task is to extract a JSON with the following structure exactly, ready to be parsed by a pydantic model:\n",
 " {question_parser.get_format_instructions()}\n",
 "\n",
@@ -612,9 +615,9 @@
 "\n",
 " Return the JSON now.\n",
 " \"\"\"\n",
-" \n",
-" response = llm_mini.invoke(prompt)\n",
-" \n",
+"\n",
+" response = llm_mini.invoke(seperate_parts_question_prompt)\n",
+"\n",
 " try:\n",
 " parsed_output_parts = question_parser.parse(response.content)\n",
 " print(f\"LLM response successfully parsed question {question_idx + 1}.\")\n",
@@ -634,7 +637,9 @@
 " solution_parser = PydanticOutputParser(pydantic_object=Set_Solution_Part)\n",
 " \n",
 " for attempt_idx in range(3):\n",
-" prompt = f\"\"\"\n",
+" # Prompt for the LLM to extract the solution part.\n",
+" # Use the full solution content and the part to extract the specific solution.\n",
+" seperate_parts_solution_prompt = f\"\"\"\n",
 " Your task is to extract a JSON with the following structure exactly, ready to be parsed by a pydantic model:\n",
 " {solution_parser.get_format_instructions()}\n",
 "\n",
@@ -643,11 +648,17 @@
 " full solution:\n",
 " {question[\"solution_content\"]}\n",
 "\n",
-" question part:\n",
+" full question content:\n",
+" {parsed_output_parts.content}\n",
+"\n",
+" full question parts:\n",
+" {parsed_output_parts.parts}\n",
+"\n",
+" target question part:\n",
 " {part}\n",
 " \"\"\"\n",
 " \n",
-" response = llm_mini.invoke(prompt)\n",
+" response = llm_mini.invoke(seperate_parts_solution_prompt)\n",
 " \n",
 " try:\n",
 " cleaned_response = escape_latex_backslashes(response.content.strip())\n",
@@ -727,6 +738,8 @@
 " - Be careful to not wrap text that is already correctly formatted with LaTeX math delimiters.\n",
 " 3. **Display Math Formatting:** This rule is critical. Display math blocks MUST be formatted strictly as follows: a blank line, the opening `$$` on its own line, the LaTeX content, the closing `$$` on its own line, and a blank line.\n",
 " - **Incorrect:** `...text $$x=y$$ more text...`\n",
+" - **Incorrect:** `...text$$x=y$$\\nmore text...`\n",
+" - **Incorrect:** `...text\\n$$x=y$$more text...`\n",
 " - **Incorrect:** `...text\\n$$\\nx=y\\n\\n$$\\nmore text...`\n",
 " - **Correct:** `...text\\n\\n$$\\nx=y\\n$$\\n\\nmore text...`\n",
 " 4. **LaTeX Environments:** Environments like `aligned`, `cases`, `matrix`, `gathered`, etc., must be entirely contained within a single display math block (`$$...$$`). Ensure that every `\\begin{...}` has a matching `\\end{...}`.\n",
@@ -786,7 +799,7 @@
 " part_solution_validation_data = {\n",
 " \"part_solution\": part_solution\n",
 " }\n",
-" \n",
+"\n",
 " validation_prompt = f\"\"\"\n",
 " Your task is to extract a JSON with the following structure exactly, to be parsed by a pydantic model:\n",
 " {part_solution_parser.get_format_instructions()}\n",
@@ -937,10 +950,10 @@
 " dict: A dictionary containing the keys \"name\" and \"exercise\".\n",
 " If parsing fails, returns None.\n",
 " \"\"\"\n",
-" corrected_md_content = correct_mistakes_in_markdown(md_content)\n",
-" print(\"Markdown content corrected for spelling, grammar, and structure.\")\n",
+" # corrected_md_content = correct_mistakes_in_markdown(md_content)\n",
+" # print(\"Markdown content corrected for spelling, grammar, and structure.\")\n",
 "\n",
-" questions_dict = extract_questions(corrected_md_content)\n",
+" questions_dict = extract_questions(md_content)\n",
 " print(\"successfully extracted the questions from the markdown. Now extracting the parts...\")\n",
 "\n",
 " extracted_dict = extract_parts_question(questions_dict)\n",
@@ -965,7 +978,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"imported_tutorial = md_to_json(md_content)"
+"full_json_question_set = md_to_json(md_content)"
 ]
 },
 {
@@ -984,24 +997,22 @@
 "outputs": [],
 "source": [
 "# Extract title\n",
-"title = imported_tutorial[\"name\"] + \" \" + imported_tutorial[\"year\"]\n",
+"title = full_json_question_set[\"name\"] + \" \" + full_json_question_set[\"year\"]\n",
 "\n",
 "# Print the title\n",
 "print(f\"Title: {title}\\n\")\n",
 "\n",
 "# Extract questions\n",
-"questions = imported_tutorial[\"questions\"]\n",
-"\n",
-"print(questions)\n",
+"questions = full_json_question_set[\"questions\"]\n",
 "\n",
 "# Loop over and print each question\n",
-"for idx1, question in enumerate(questions, start=1):\n",
-" print(f\"**Question {idx1}**:\\n{question.get('title')}\\n\")\n",
+"for question_idx, question in enumerate(questions, start=1):\n",
+" print(f\"**Question {question_idx}**:\\n{question.get('title')}\\n\")\n",
 " print(f\"Content: {question.get('content')}\\n\")\n",
-" for idx2, (part, part_answer) in enumerate(zip(question.get(\"parts\", []), question.get(\"parts_solutions\", [])), start=1):\n",
-" print(f\"Question {idx1}:\")\n",
-" print(f\"- Subquestion {idx2}: {part}\")\n",
-" print(f\"- Worked Solution {idx2}: {part_answer}\")\n",
+" for part_idx, (part_question, part_answer) in enumerate(zip(question.get(\"parts\", []), question.get(\"parts_solutions\", [])), start=1):\n",
+" print(f\"Question {question_idx}:\")\n",
+" print(f\"- Subquestion {part_idx}: {part_question}\")\n",
+" print(f\"- Worked Solution {part_idx}: {part_answer}\")\n",
 " print(\"\\n\")\n",
 " print(\"-\" * 40) # Separator for readability"
 ]
@@ -1021,12 +1032,12 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"questions = imported_tutorial[\"questions\"]\n",
+"questions = full_json_question_set[\"questions\"]\n",
 "\n",
 "in2lambda_questions = []\n",
 "\n",
 "# Loop over all questions and question_answers and use in2lambda API to create a JSON.\n",
-"for idx, question_dict in enumerate(questions, start=1):\n",
+"for question_idx, question_dict in enumerate(questions, start=1):\n",
 " parts = []\n",
 " for part_question, part_solution in zip(question_dict.get(\"parts\", []), question_dict.get(\"parts_solutions\", [])):\n",
 " part_obj = Part(\n",
@@ -1048,7 +1059,7 @@
 " print(f\"Warning: Image file not found: {full_path}\")\n",
 "\n",
 " question = Question(\n",
-" title=question_dict.get(\"title\", f\"Question {idx}\"),\n",
+" title=question_dict.get(\"title\", f\"Question {question_idx}\"),\n",
 " main_text=question_dict.get(\"content\", \"\"),\n",
 " parts=parts,\n",
 " images=image_paths\n",

conversion2025/todo.txt

Lines changed: 2 additions & 1 deletion
@@ -1,4 +1,5 @@
 TODO:
 
 ensure no answer in question itself, only in parts_solutions.
-properly handles images allocation
+properly handles images allocation
+give the LLM access to the full question to better decide how to split the solution into parts
