|
279 | 279 | "outputs": [], |
280 | 280 | "source": [ |
281 | 281 | "# Define the schema for the tutorial output.\n", |
282 | | - "class Exercise(BaseModel):\n", |
| 282 | + "class Set_Question(BaseModel):\n", |
283 | 283 | " title: str = Field(..., description=\"Title of the exercise (only the text, no numbering)\")\n", |
284 | 284 | " content: str = Field(..., description=\"Content of the exercise (no exercise title, no subquestions)\")\n", |
285 | 285 | " subquestions: list[str] = Field(..., description=\"List of subquestions within the exercise (only the text, no numbering)\")\n", |
286 | | - " worked_solution_answers: list[str] = Field(..., description=\"List of answers to the exercise (only the text, no numbering)\")\n", |
287 | 286 | " \n", |
288 | | - "class Tutorial(BaseModel):\n", |
289 | | - " name: str = Field(..., description=\"Title of the tutorial\")\n", |
290 | | - " year: str = Field(..., description=\"Year of the tutorial\")\n", |
291 | | - " exercises: list[Exercise] = Field(..., description=\"List of tutorial questions\")\n", |
| 287 | + "class Set_Answer(BaseModel):\n", |
| 288 | + " title: str = Field(..., description=\"Title of the exercise (only the text, no numbering)\")\n", |
| 289 | + " workedSolutions: list[str] = Field(..., description=\"List of worked solution to subquestions within the exercise (no numbering or counting)\")\n", |
| 290 | + "\n", |
| 291 | + "class Set(BaseModel):\n", |
| 292 | + " name: str = Field(..., description=\"Title of the set\")\n", |
| 293 | + " year: str = Field(..., description=\"Year of the set\")\n", |
| 294 | + " exercise: list[Set_Question] = Field(..., description=\"List of exercises in the set\")\n", |
| 295 | + " workedSolution: list[Set_Answer] = Field(..., description=\"List of worked solutions for the exercises in the set\")\n", |
292 | 296 | "\n", |
293 | 297 | "def extract_tutorial_questions(doc_page_content: str) -> dict:\n", |
294 | 298 | " \"\"\"\n", |
|
302 | 306 | " \"name\": \"<title of tutorial>\",\n", |
303 | 307 | " \"year\": \"<year of tutorial>\",\n", |
304 | 308 | " \"exercise\": [\n", |
305 | | - " { title: \"exercise text 1\", content: \"content text exercise 1\", subquestions: [\"subquestion text 1\", \"subquestion text 2\", ...], worked_solution_answers: [\"worked solution answer 1\", \"worked solution answer 2\", ...] },\n", |
306 | | - " { title: \"exercise text 2\", content: \"content text exercise 2\", subquestions: [\"subquestion text 1\", \"subquestion text 2\", ...], worked_solution_answers: [\"worked solution answer 1\", \"worked solution answer 2\", ...] },\n", |
| 309 | + " { title: \"exercise text 1\", content: \"content text exercise 1\", subquestions: [\"subquestion text 1\", \"subquestion text 2\", ...] },\n", |
| 310 | + " { title: \"exercise text 2\", content: \"content text exercise 2\", subquestions: [\"subquestion text 1\", \"subquestion text 2\", ...] },\n", |
| 311 | + " ...\n", |
| 312 | + " ],\n", |
| 313 | + " \"workedSolution\": [\n", |
| 314 | + " { title: \"exercise text 1\", workedSolutions: [\"solution text 1\", \"solution text 2\", ...] },\n", |
| 315 | + " { title: \"exercise text 2\", workedSolutions: [\"solution text 1\", \"solution text 2\", ...] },\n", |
307 | 316 | " ...\n", |
308 | 317 | " ]\n", |
309 | 318 | " }\n", |
310 | 319 | " \n", |
311 | | - " The tutorial sheet (IMPORTED_TUTORIAL) may contain reference solutions; do not alter\n", |
312 | 320 | " Do not alter the original text of the exercises. The function returns a dictionary parsed from the JSON output.\n", |
313 | | - " the questions, parts and their respective answers/worked solutions may not be in the same area, and may not even exist, in which case the function will return empty strings for those fields.\n", |
| 321 | + " if any of the text mentions a figure/diagram, then also find the figure and add it to the content of the exercise.\n", |
314 | 322 | " \n", |
315 | 323 | " Args:\n", |
316 | | - " doc_page_content (str): The content of the tutorial sheet.\n", |
| 324 | + " doc_page_content (str): The content of a set.\n", |
317 | 325 | " \n", |
318 | 326 | " Returns:\n", |
319 | 327 | " dict: A dictionary containing the keys \"name\", \"year\", \"exercise\" and \"workedSolution\".\n", |
320 | 328 | " If parsing fails, returns None.\n", |
321 | 329 | " \"\"\"\n", |
322 | 330 | " # Initialize the output parser with the Set schema.\n", |
323 | | - " parser = PydanticOutputParser(pydantic_object=Tutorial)\n", |
| 331 | + " parser = PydanticOutputParser(pydantic_object=Set)\n", |
324 | 332 | "\n", |
325 | 333 | " # Construct the prompt, appending the parser's format instructions.\n", |
326 | 334 | " prompt = f\"\"\"\n", |
327 | | - " IMPORTED_TUTORIAL\n", |
| 335 | + " IMPORTED_SET\n", |
328 | 336 | " ```markdown\n", |
329 | 337 | " {doc_page_content}\n", |
330 | 338 | " ```\n", |
331 | 339 | "\n", |
332 | | - " IMPORTED_TUTORIAL is a tutorial sheet with several exercises. It may or may\n", |
333 | | - " not include reference solutions. Please infer the title of the tutorial from\n", |
334 | | - " the content, if no suitable name found, just leave as Tutorial 0.0, and extract each individual question as a separate string. Do\n", |
335 | | - " not modify the text of the exercises. Only use $...$ for math expressions.\n", |
| 340 | + " IMPORTED_SET is a set of questions. It may or may not include reference solutions.\n", |
| 341 | + " Infer the title of the set from the content, if no suitable name found, just leave as Unnamed Set, and extract each individual question as a separate string.\n", |
| 342 | + " Do not modify the text of the exercises. \n", |
| 343 | + " Only use $...$ for math expressions.\n", |
336 | 344 | "\n", |
337 | | - " If the exercise mentions figures, then find all the captions of figures (no links). \n", |
338 | | - " Keep the captions as \"Figure Q1 - ...\".\n", |
| 345 | + " If the exercise mentions figures/diagrams, then find the diagram (the local path) that it is talking about,\n", |
| 346 | + " and include it in the content of the exercise.\n", |
339 | 347 | "\n", |
340 | 348 | " If the exercise mentions tables, then include the table in the content.\n", |
341 | 349 | "\n", |
| 350 | + " Ensure that there is a workedSolution for each exercise, which should have the same title and a list of solutions that matches the subquestions.\n", |
| 351 | + "\n", |
342 | 352 | " Return a valid JSON string with the following structure:\n", |
343 | 353 | " {parser.get_format_instructions()}\n", |
344 | 354 | " \"\"\"\n", |
|
384 | 394 | "print(f\"Title: {title}\\n\")\n", |
385 | 395 | "\n", |
386 | 396 | "# Extract questions\n", |
387 | | - "questions = imported_tutorial[\"exercises\"]\n", |
| 397 | + "questions = imported_tutorial[\"exercise\"]\n", |
| 398 | + "solutions = imported_tutorial[\"workedSolution\"]\n", |
388 | 399 | "\n", |
389 | 400 | "# Loop over and print each question\n", |
390 | | - "for idx, question in enumerate(questions, start=1):\n", |
391 | | - " print(f\"**Question {idx}**:\\n{question.get(\"title\")}\\n\")\n", |
| 401 | + "for idx1, (question, solution) in enumerate(zip(questions, solutions), start=1):\n", |
| 402 | + " print(f\"**Question {idx1}**:\\n{question.get(\"title\")}\\n\")\n", |
392 | 403 | " print(f\"Content: {question.get(\"content\")}\\n\")\n", |
393 | | - " print(\"Subquestions:\")\n", |
394 | | - " for subquestion in question.get(\"subquestions\", []):\n", |
395 | | - " print(f\"- {subquestion}\")\n", |
396 | | - " print(\"Worked Solution Answers:\")\n", |
397 | | - " for answer in question.get(\"worked_solution_answers\", []):\n", |
398 | | - " print(f\"- {answer}\")\n", |
| 404 | + " for idx2, (subquestion, subanswer) in enumerate(zip(question.get(\"subquestions\", []), solution.get(\"workedSolutions\", [])), start=1):\n", |
| 405 | + " print(f\"Question {idx1}:\")\n", |
| 406 | + " print(f\"- Subquestion {idx2}: {subquestion}\")\n", |
| 407 | + " print(f\"- Worked Solution {idx2}: {subanswer}\")\n", |
| 408 | + " print(\"\\n\")\n", |
399 | 409 | " print(\"-\" * 40) # Separator for readability" |
400 | 410 | ] |
401 | 411 | }, |
|
438 | 448 | " publish: bool = Field(..., description=\"Publish flag\")\n", |
439 | 449 | " title: str = Field(..., description=\"Question title\")\n", |
440 | 450 | "\n", |
441 | | - "def create_question_json(question: str) -> dict:\n", |
| 451 | + "def create_question_json(question: str, solution: str) -> dict:\n", |
442 | 452 | " # Initialize the output parser using the defined Pydantic model.\n", |
443 | 453 | " parser = PydanticOutputParser(pydantic_object=QuestionJson)\n", |
444 | 454 | "\n", |
|
480 | 490 | " {question}\n", |
481 | 491 | " ```\n", |
482 | 492 | "\n", |
| 493 | + " IMPORTED_SOLUTION\n", |
| 494 | + " ```markdown\n", |
| 495 | + " {solution}\n", |
| 496 | + " ```\n", |
| 497 | + "\n", |
483 | 498 | " Preserve the markdown math formatting to use $...$ for math expressions. Do not modify the original text of the question.\n", |
484 | 499 | "\n", |
485 | 500 | " Infer the final answer and put it in the answerContent field of the part. \n", |
|
511 | 526 | "metadata": {}, |
512 | 527 | "outputs": [], |
513 | 528 | "source": [ |
514 | | - "questions = imported_tutorial[\"exercises\"]\n", |
| 529 | + "questions = imported_tutorial[\"exercise\"]\n", |
| 530 | + "solutions = imported_tutorial[\"workedSolution\"]\n", |
| 531 | + "\n", |
515 | 532 | "\n", |
516 | 533 | "# Loop over each question together with its worked solution and print each question\n", |
517 | | - "for idx, question in zip(range(1, len(questions)+1), questions):\n", |
| 534 | + "for idx, (question, solution) in enumerate(zip(questions, solutions), start=1):\n", |
518 | 535 | " print(f\"**Question {idx}**:\\n{question}\\n\")\n", |
519 | 536 | " # print(f\"**Question Answers {idx}**:\\n{question_ans}\\n\")\n", |
520 | 537 | "\n", |
521 | 538 | " print(\"INFO: Mapping question in markdown into JSON\")\n", |
522 | | - " question_json = create_question_json(question)\n", |
| 539 | + " question_json = create_question_json(question,solution)\n", |
523 | 540 | " question_json[\"orderNumber\"] = idx-1\n", |
524 | 541 | " print(f\"INFO: JSON {idx}:\\n{question_json}\\n\")\n", |
525 | 542 | " \n", |
526 | | - " print(\"INFO: Get figures\")\n", |
| 543 | + " # print(\"INFO: Get figures\")\n", |
527 | 544 | " # updated_question_json = add_figure_references_to_questions(figures, question_json)\n", |
528 | 545 | " # updated_question_json = add_local_figures_to_questions(figures, question_json)\n", |
529 | 546 | " updated_question_json = question_json\n", |
530 | 547 | "\n", |
531 | 548 | " question_name = updated_question_json[\"title\"].replace(\" \", \"_\")\n", |
532 | 549 | " question_index = f\"{(idx-1):03}\" \n", |
533 | | - " filename = f\"{set_path}question_{question_index}_{question_name}.json\"\n", |
| 550 | + " filename = f\"{output_path}/question_{question_index}_{question_name}.json\"\n", |
534 | 551 | " print(f\"INFO: writing {filename}\")\n", |
535 | 552 | " open(filename, \"w\").write(json.dumps(updated_question_json, indent=2))\n", |
536 | 553 | " \n", |
|
0 commit comments