|
83 | 83 | " print(\"PDF ID:\", pdf_id)\n", |
84 | 84 | " print(\"Response:\", r.json())\n", |
85 | 85 | "\n", |
| 86 | + " # URL where the processed PDF's Markdown output will be available\n", |
86 | 87 | " url = f\"https://api.mathpix.com/v3/pdf/{pdf_id}.md\"\n", |
87 | 88 | " headers = {\n", |
88 | 89 | " \"app_id\": MATHPIX_APP_ID,\n", |
|
121 | 122 | "metadata": {}, |
122 | 123 | "outputs": [], |
123 | 124 | "source": [ |
| 125 | + "# location of the output folder and media folder.\n", |
124 | 126 | "folder_path = \"conversion_content\"\n", |
125 | 127 | "output_path = f\"{folder_path}/mathpix_to_llm_to_in2lambda_to_JSON_out\"\n", |
126 | 128 | "media_path = f\"{output_path}/media\"\n", |
127 | 129 | "\n", |
| 130 | + "# Create output and media directories if they do not exist.\n", |
128 | 131 | "Path(media_path).mkdir(parents=True, exist_ok=True)\n", |
129 | 132 | "\n", |
| 133 | + "# location of the source pdf file and the result markdown file.\n", |
130 | 134 | "source_path = f\"{folder_path}/example.pdf\"\n", |
131 | 135 | "result_path = f\"{output_path}/example.md\"\n", |
132 | 136 | "\n", |
|
139 | 143 | " print(f\"Error: Source PDF file not found at {source_path}\")\n", |
140 | 144 | " exit(1)\n", |
141 | 145 | "\n", |
| 146 | + "# Read the markdown content from the result file.\n", |
142 | 147 | "try:\n", |
143 | 148 | " with open(result_path, \"r\") as f:\n", |
144 | 149 | " md_content = f.read()\n", |
|
176 | 181 | " \"\"\"\n", |
177 | 182 | " figures = {}\n", |
178 | 183 | " # Regex to match figure references and their descriptions\n", |
| 184 | + " # Matches the Markdown image format ![alt text](url)\n", |
179 | 185 | " pattern = r'!\\[.*?\\]\\((.*?)\\)'\n", |
180 | 186 | " matches = re.findall(pattern, text)\n", |
181 | 187 | " print(f\"Matches found: {matches}\")\n", |
182 | 188 | " \n", |
183 | 189 | " for match in matches:\n", |
184 | 190 | " url = match\n", |
185 | 191 | " url = url.strip()\n", |
186 | | - " figure_caption_pattern = rf'\\({re.escape(url)}\\)\\s*-?\\s*Figure\\s+(Q\\d+)\\s*-\\s*(.+?)\\n'\n", |
187 | | - " caption_match = re.search(figure_caption_pattern, text)\n", |
188 | | - "\n", |
189 | | - " if caption_match:\n", |
190 | | - " title, description = caption_match.groups()\n", |
191 | | - " print(\"Caption match found\")\n", |
192 | | - " else:\n", |
193 | | - " title, description = \"\", \"\"\n", |
194 | | - "\n", |
| 192 | + " \n", |
195 | 193 | " if url.startswith(\"http\"):\n", |
196 | 194 | " # Download the image and save it to a file\n", |
197 | 195 | " image = Image.open(requests.get(url, stream=True).raw)\n", |
198 | 196 | " # Create a figure name based on the URL\n", |
199 | 197 | " fig_name = os.path.basename(url)\n", |
200 | 198 | " figures[fig_name] = {\n", |
201 | 199 | " \"image\": image,\n", |
202 | | - " \"title\": title.strip(),\n", |
203 | | - " \"label\": description.strip(),\n", |
204 | 200 | " \"url\": url,\n", |
205 | 201 | " \"local_path\": \"\",\n", |
206 | 202 | " # \"answerFile\": ans\n", |
|
228 | 224 | "source": [ |
229 | 225 | "def save_figures_to_path(figures):\n", |
230 | 226 | " for idx, (fig_name, fig_info) in enumerate(figures.items()):\n", |
231 | | - " print(f\"FIGURE Title='{fig_info['title']}', Label='{fig_info['label']}', URL='{fig_info['url']}'\")\n", |
| 227 | + " print(f\"URL='{fig_info['url']}'\")\n", |
| 228 | + "\n", |
232 | 229 | " # Extract file extension and create a clean filename\n", |
| 230 | + " # Mathpix leaves image urls like `image.png?width=800&height=600`\n", |
| 231 | + " # We only want the base name without query parameters.\n", |
233 | 232 | " if \"?\" in fig_name:\n", |
234 | 233 | " end_location = fig_name.index(\"?\")\n", |
235 | 234 | " image_name = f\"{idx}_{fig_name[:end_location]}\"\n", |
|
238 | 237 | " \n", |
239 | 238 | " fig_info[\"local_path\"] = image_name\n", |
240 | 239 | " try:\n", |
| 240 | + " # Saves the image to the media path\n", |
241 | 241 | " fig_info[\"image\"].save(f\"{media_path}/{fig_info['local_path']}\")\n", |
242 | 242 | " print(f\"Saved image: {fig_info['local_path']}\")\n", |
243 | 243 | " except Exception as e:\n", |
|
307 | 307 | " api_key=os.environ[\"OPENAI_API_KEY\"],\n", |
308 | 308 | " )\n", |
309 | 309 | "\n", |
310 | | - "# Uses gpt-4o-mini:\n", |
| 310 | + "# Uses gpt-4.1-mini:\n", |
311 | 311 | "# - more intelligent\n", |
312 | 312 | "llm_mini = ChatOpenAI(\n", |
313 | 313 | " model=\"gpt-4.1-mini\",\n", |
|
349 | 349 | "\"\"\"\n", |
350 | 350 | "\n", |
351 | 351 | "def correct_mistakes_in_markdown(md_content: str) -> str:\n", |
352 | | - " prompt = f\"\"\"\n", |
| 352 | + " correct_mistakes_prompt = f\"\"\"\n", |
353 | 353 | " {llm_task_correct_mistakes}\n", |
354 | 354 | "\n", |
355 | 355 | " ```input\n", |
|
359 | 359 | " Return the markdown now.\n", |
360 | 360 | " \"\"\"\n", |
361 | 361 | "\n", |
362 | | - " response = llm_nano.invoke(prompt)\n", |
| 362 | + " response = llm_nano.invoke(correct_mistakes_prompt)\n", |
363 | 363 | " print(\"Corrected markdown content:\")\n", |
364 | 364 | " print(response.content.strip())\n", |
365 | 365 | "\n", |
|
401 | 401 | " - Identify the `year` if mentioned; otherwise, use \"0\".\n", |
402 | 402 | " - For each question, carefully extract the full question text into `question_content` and the corresponding full solution/answer text into `solution_content`. They may not be in the same section.\n", |
403 | 403 | " - If no solution is found, leave `solution_content` as an empty string `\"\"`.\n", |
404 | | - " - Preserve all image tags like ``, making sure they are placed with their respective \"question_content\" and \"solution_content\".\n", |
| 404 | + " - Preserve all image tags like ``, making sure they are placed with their respective \"question_content\" and \"solution_content\". Do not duplicate it.\n", |
405 | 405 | " - For Each Question extract all image references (e.g., `filename.jpg`) found within the `question_content` and `solution_content` and place them in the `images` list.\n", |
406 | 406 | "\n", |
407 | 407 | " 2. **Output Format (Crucial):**\n", |
|
420 | 420 | " # Initialise the parser for the output.\n", |
421 | 421 | " parser = PydanticOutputParser(pydantic_object=AllQuestionsModel)\n", |
422 | 422 | "\n", |
423 | | - " prompt = f\"\"\"\n", |
| 423 | + " # Prompt for the LLM to extract questions.\n", |
| 424 | + " seperate_questions_prompt = f\"\"\"\n", |
424 | 425 | " Your task is to extract a JSON with the following structure exactly, ready to be parsed by a pydantic model:\n", |
425 | 426 | " {parser.get_format_instructions()}\n", |
426 | 427 | "\n", |
|
437 | 438 | " for attempt_idx in range(3):\n", |
438 | 439 | " \n", |
439 | 440 | " # Call the LLM\n", |
440 | | - " response = llm_mini.invoke(prompt)\n", |
| 441 | + " response = llm_mini.invoke(seperate_questions_prompt)\n", |
441 | 442 | "\n", |
442 | 443 | " # Debug: print the raw LLM response\n", |
443 | 444 | " # print(\"Raw LLM Response:\")\n", |
|
551 | 552 | " - Identify all sub-questions (e.g., \"(a)\", \"(b)\", \"i.\", \"ii.\") and place their text into the `parts` list. Sub-questions may also be implied.\n", |
552 | 553 | " - Questions with no sub-questions should have a single part in the `parts` list, which is the entire question text.\n", |
553 | 554 | " - Ensure that images references are correctly placed with their respective parts.\n", |
554 | | - " - Preserve all content perfectly, including text, LaTeX, and image tags like ``.\n", |
| 555 | + " - Preserve all content perfectly, including text, LaTeX, and image tags like ``. Do not duplicate images.\n", |
555 | 556 | " - Ensure no solution content is included in the `content` or `parts` fields.\n", |
556 | 557 | " - You may choose what the title of the question should be.\n", |
557 | 558 | " - The `images` list should be copied exactly from the input.\n", |
|
572 | 573 | " Please follow these rules carefully:\n", |
573 | 574 | "\n", |
574 | 575 | " 1. **Content Extraction:**\n", |
575 | | - " - From the `full solution`, find the worked solution that corresponds to the given `question part`.\n", |
576 | | - " - Make sure the solutions for all parts together include the entire full solution text, with no missing content.\n", |
| 576 | + " - From the `full solution`, find the worked solution that corresponds to the given `target question part`.\n", |
| 577 | + " - Use the full question content and full question parts to help identify the correct parts of the solution to be extracted.\n", |
577 | 578 | " - Place this exact text into the `part_solution` field.\n", |
578 | | - " - Ensure that images references are correctly placed with their respective parts.\n", |
| 579 | + " - Ensure that images references are correctly placed with their respective parts. Do not duplicate images.\n", |
579 | 580 | " - Preserve all content perfectly, including text, LaTeX, and image tags like ``.\n", |
580 | 581 | " - If no specific solution is found, use an empty string `\"\"`.\n", |
581 | 582 | "\n", |
|
599 | 600 | " \n", |
600 | 601 | " # Process the question part\n", |
601 | 602 | " for attempt_idx in range(3):\n", |
602 | | - " prompt = f\"\"\"\n", |
| 603 | + " # Prompt for the LLM to extract the question parts.\n", |
| 604 | + " # Use the full question content and the images to extract the parts.\n", |
| 605 | + " seperate_parts_question_prompt = f\"\"\"\n", |
603 | 606 | " Your task is to extract a JSON with the following structure exactly, ready to be parsed by a pydantic model:\n", |
604 | 607 | " {question_parser.get_format_instructions()}\n", |
605 | 608 | "\n", |
|
612 | 615 | "\n", |
613 | 616 | " Return the JSON now.\n", |
614 | 617 | " \"\"\"\n", |
615 | | - " \n", |
616 | | - " response = llm_mini.invoke(prompt)\n", |
617 | | - " \n", |
| 618 | + "\n", |
| 619 | + " response = llm_mini.invoke(seperate_parts_question_prompt)\n", |
| 620 | + "\n", |
618 | 621 | " try:\n", |
619 | 622 | " parsed_output_parts = question_parser.parse(response.content)\n", |
620 | 623 | " print(f\"LLM response successfully parsed question {question_idx + 1}.\")\n", |
|
634 | 637 | " solution_parser = PydanticOutputParser(pydantic_object=Set_Solution_Part)\n", |
635 | 638 | " \n", |
636 | 639 | " for attempt_idx in range(3):\n", |
637 | | - " prompt = f\"\"\"\n", |
| 640 | + " # Prompt for the LLM to extract the solution part.\n", |
| 641 | + " # Use the full solution content and the part to extract the specific solution.\n", |
| 642 | + " seperate_parts_solution_prompt = f\"\"\"\n", |
638 | 643 | " Your task is to extract a JSON with the following structure exactly, ready to be parsed by a pydantic model:\n", |
639 | 644 | " {solution_parser.get_format_instructions()}\n", |
640 | 645 | "\n", |
|
643 | 648 | " full solution:\n", |
644 | 649 | " {question[\"solution_content\"]}\n", |
645 | 650 | "\n", |
646 | | - " question part:\n", |
| 651 | + " full question content:\n", |
| 652 | + " {parsed_output_parts.content}\n", |
| 653 | + "\n", |
| 654 | + " full question parts:\n", |
| 655 | + " {parsed_output_parts.parts}\n", |
| 656 | + "\n", |
| 657 | + " target question part:\n", |
647 | 658 | " {part}\n", |
648 | 659 | " \"\"\"\n", |
649 | 660 | " \n", |
650 | | - " response = llm_mini.invoke(prompt)\n", |
| 661 | + " response = llm_mini.invoke(seperate_parts_solution_prompt)\n", |
651 | 662 | " \n", |
652 | 663 | " try:\n", |
653 | 664 | " cleaned_response = escape_latex_backslashes(response.content.strip())\n", |
|
727 | 738 | " - Be careful to not wrap text that is already correctly formatted with LaTeX math delimiters.\n", |
728 | 739 | " 3. **Display Math Formatting:** This rule is critical. Display math blocks MUST be formatted strictly as follows: a blank line, the opening `$$` on its own line, the LaTeX content, the closing `$$` on its own line, and a blank line.\n", |
729 | 740 | " - **Incorrect:** `...text $$x=y$$ more text...`\n", |
| 741 | + " - **Incorrect:** `...text$$x=y$$\\nmore text...`\n", |
| 742 | + " - **Incorrect:** `...text\\n$$x=y$$more text...`\n", |
730 | 743 | " - **Incorrect:** `...text\\n$$\\nx=y\\n\\n$$\\nmore text...`\n", |
731 | 744 | " - **Correct:** `...text\\n\\n$$\\nx=y\\n$$\\n\\nmore text...`\n", |
732 | 745 | " 4. **LaTeX Environments:** Environments like `aligned`, `cases`, `matrix`, `gathered`, etc., must be entirely contained within a single display math block (`$$...$$`). Ensure that every `\\begin{...}` has a matching `\\end{...}`.\n", |
|
786 | 799 | " part_solution_validation_data = {\n", |
787 | 800 | " \"part_solution\": part_solution\n", |
788 | 801 | " }\n", |
789 | | - " \n", |
| 802 | + "\n", |
790 | 803 | " validation_prompt = f\"\"\"\n", |
791 | 804 | " Your task is to extract a JSON with the following structure exactly, to be parsed by a pydantic model:\n", |
792 | 805 | " {part_solution_parser.get_format_instructions()}\n", |
|
937 | 950 | " dict: A dictionary containing the keys \"name\" and \"exercise\".\n", |
938 | 951 | " If parsing fails, returns None.\n", |
939 | 952 | " \"\"\"\n", |
940 | | - " corrected_md_content = correct_mistakes_in_markdown(md_content)\n", |
941 | | - " print(\"Markdown content corrected for spelling, grammar, and structure.\")\n", |
| 953 | + " # corrected_md_content = correct_mistakes_in_markdown(md_content)\n", |
| 954 | + " # print(\"Markdown content corrected for spelling, grammar, and structure.\")\n", |
942 | 955 | "\n", |
943 | | - " questions_dict = extract_questions(corrected_md_content)\n", |
| 956 | + " questions_dict = extract_questions(md_content)\n", |
944 | 957 | " print(\"successfully extracted the questions from the markdown. Now extracting the parts...\")\n", |
945 | 958 | "\n", |
946 | 959 | " extracted_dict = extract_parts_question(questions_dict)\n", |
|
965 | 978 | "metadata": {}, |
966 | 979 | "outputs": [], |
967 | 980 | "source": [ |
968 | | - "imported_tutorial = md_to_json(md_content)" |
| 981 | + "full_json_question_set = md_to_json(md_content)" |
969 | 982 | ] |
970 | 983 | }, |
971 | 984 | { |
|
984 | 997 | "outputs": [], |
985 | 998 | "source": [ |
986 | 999 | "# Extract title\n", |
987 | | - "title = imported_tutorial[\"name\"] + \" \" + imported_tutorial[\"year\"]\n", |
| 1000 | + "title = full_json_question_set[\"name\"] + \" \" + full_json_question_set[\"year\"]\n", |
988 | 1001 | "\n", |
989 | 1002 | "# Print the title\n", |
990 | 1003 | "print(f\"Title: {title}\\n\")\n", |
991 | 1004 | "\n", |
992 | 1005 | "# Extract questions\n", |
993 | | - "questions = imported_tutorial[\"questions\"]\n", |
994 | | - "\n", |
995 | | - "print(questions)\n", |
| 1006 | + "questions = full_json_question_set[\"questions\"]\n", |
996 | 1007 | "\n", |
997 | 1008 | "# Loop over and print each question\n", |
998 | | - "for idx1, question in enumerate(questions, start=1):\n", |
999 | | - " print(f\"**Question {idx1}**:\\n{question.get('title')}\\n\")\n", |
| 1009 | + "for question_idx, question in enumerate(questions, start=1):\n", |
| 1010 | + " print(f\"**Question {question_idx}**:\\n{question.get('title')}\\n\")\n", |
1000 | 1011 | " print(f\"Content: {question.get('content')}\\n\")\n", |
1001 | | - " for idx2, (part, part_answer) in enumerate(zip(question.get(\"parts\", []), question.get(\"parts_solutions\", [])), start=1):\n", |
1002 | | - " print(f\"Question {idx1}:\")\n", |
1003 | | - " print(f\"- Subquestion {idx2}: {part}\")\n", |
1004 | | - " print(f\"- Worked Solution {idx2}: {part_answer}\")\n", |
| 1012 | + " for part_idx, (part_question, part_answer) in enumerate(zip(question.get(\"parts\", []), question.get(\"parts_solutions\", [])), start=1):\n", |
| 1013 | + " print(f\"Question {question_idx}:\")\n", |
| 1014 | + " print(f\"- Subquestion {part_idx}: {part_question}\")\n", |
| 1015 | + " print(f\"- Worked Solution {part_idx}: {part_answer}\")\n", |
1005 | 1016 | " print(\"\\n\")\n", |
1006 | 1017 | " print(\"-\" * 40) # Separator for readability" |
1007 | 1018 | ] |
|
1021 | 1032 | "metadata": {}, |
1022 | 1033 | "outputs": [], |
1023 | 1034 | "source": [ |
1024 | | - "questions = imported_tutorial[\"questions\"]\n", |
| 1035 | + "questions = full_json_question_set[\"questions\"]\n", |
1025 | 1036 | "\n", |
1026 | 1037 | "in2lambda_questions = []\n", |
1027 | 1038 | "\n", |
1028 | 1039 | "# Loop over all questions and question_answers and use in2lambda API to create a JSON.\n", |
1029 | | - "for idx, question_dict in enumerate(questions, start=1):\n", |
| 1040 | + "for question_idx, question_dict in enumerate(questions, start=1):\n", |
1030 | 1041 | " parts = []\n", |
1031 | 1042 | " for part_question, part_solution in zip(question_dict.get(\"parts\", []), question_dict.get(\"parts_solutions\", [])):\n", |
1032 | 1043 | " part_obj = Part(\n", |
|
1048 | 1059 | " print(f\"Warning: Image file not found: {full_path}\")\n", |
1049 | 1060 | "\n", |
1050 | 1061 | " question = Question(\n", |
1051 | | - " title=question_dict.get(\"title\", f\"Question {idx}\"),\n", |
| 1062 | + " title=question_dict.get(\"title\", f\"Question {question_idx}\"),\n", |
1052 | 1063 | " main_text=question_dict.get(\"content\", \"\"),\n", |
1053 | 1064 | " parts=parts,\n", |
1054 | 1065 | " images=image_paths\n", |
|
0 commit comments