fixed yml by also upgrading virtualenv to allow the correct, newer version of it to be used

HarrySu123 · HarrySu123 · commit 75842cb15375 · 2025-07-07T16:56:36.000+01:00
diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml
@@ -27,7 +27,7 @@ jobs:
         sudo apt install -y texlive-latex-extra graphviz
     - name: Install Python Dependencies
       run: |
-        python -m pip install --upgrade pip
+        python -m pip install --upgrade pip setuptools wheel virtualenv
         pip install poetry
         poetry config virtualenvs.create false
         poetry install
diff --git a/wizard/sandbox_mathpix_2025.ipynb b/wizard/sandbox_mathpix_2025.ipynb
@@ -279,16 +279,20 @@
    "outputs": [],
    "source": [
     "# Define the schema for the tutorial output.\n",
-    "class Exercise(BaseModel):\n",
+    "class Set_Question(BaseModel):\n",
     "    title: str = Field(..., description=\"Title of the exercise (only the text, no numbering)\")\n",
     "    content: str = Field(..., description=\"Content of the exercise (no exercise title, no subquestions)\")\n",
     "    subquestions: list[str] = Field(..., description=\"List of subquestions within the exercise (only the text, no numbering)\")\n",
-    "    worked_solution_answers: list[str] = Field(..., description=\"List of answers to the exercise (only the text, no numbering)\")\n",
     "    \n",
-    "class Tutorial(BaseModel):\n",
-    "    name: str = Field(..., description=\"Title of the tutorial\")\n",
-    "    year: str = Field(..., description=\"Year of the tutorial\")\n",
-    "    exercises: list[Exercise] = Field(..., description=\"List of tutorial questions\")\n",
+    "class Set_Answer(BaseModel):\n",
+    "    title: str = Field(..., description=\"Title of the exercise (only the text, no numbering)\")\n",
+    "    workedSolutions: list[str] = Field(..., description=\"List of worked solution to subquestions within the exercise (no numbering or counting)\")\n",
+    "\n",
+    "class Set(BaseModel):\n",
+    "    name: str = Field(..., description=\"Title of the set\")\n",
+    "    year: str = Field(..., description=\"Year of the set\")\n",
+    "    exercise: list[Set_Question] = Field(..., description=\"List of exercises in the set\")\n",
+    "    workedSolution: list[Set_Answer] = Field(..., description=\"List of worked solutions for the exercises in the set\")\n",
     "\n",
     "def extract_tutorial_questions(doc_page_content: str) -> dict:\n",
     "    \"\"\"\n",
@@ -302,43 +306,49 @@
     "        \"name\": \"<title of tutorial>\",\n",
     "        \"year\": \"<year of tutorial>\",\n",
     "        \"exercise\": [\n",
-    "            { title: \"exercise text 1\", content: \"content text exercise 1\", subquestions: [\"subquestion text 1\", \"subquestion text 2\", ...], worked_solution_answers: [\"worked solution answer 1\", \"worked solution answer 2\", ...] },\n",
-    "            { title: \"exercise text 2\", content: \"content text exercise 2\", subquestions: [\"subquestion text 1\", \"subquestion text 2\", ...], worked_solution_answers: [\"worked solution answer 1\", \"worked solution answer 2\", ...] },\n",
+    "            { title: \"exercise text 1\", content: \"content text exercise 1\", subquestions: [\"subquestion text 1\", \"subquestion text 2\", ...] },\n",
+    "            { title: \"exercise text 2\", content: \"content text exercise 2\", subquestions: [\"subquestion text 1\", \"subquestion text 2\", ...] },\n",
+    "            ...\n",
+    "        ],\n",
+    "        \"workedSolution\": [\n",
+    "            { title: \"exercise text 1\", workedSolutions: [\"solution text 1\", \"solution text 2\", ...] },\n",
+    "            { title: \"exercise text 2\", workedSolutions: [\"solution text 1\", \"solution text 2\", ...] },\n",
     "            ...\n",
     "        ]\n",
     "    }\n",
     "    \n",
-    "    The tutorial sheet (IMPORTED_TUTORIAL) may contain reference solutions; do not alter\n",
     "    the original text of the exercises. The function returns a dictionary parsed from the JSON output.\n",
-    "    the questions, parts and their respective answers/worked solutions may not be in the same area, and may not even exist, in which case the function will return empty strings for those fields.\n",
+    "    If any of the text mentions a figure/diagram, then also find the figure and add it to the content of the exercise.\n",
     "    \n",
     "    Args:\n",
-    "        doc_page_content (str): The content of the tutorial sheet.\n",
+    "        doc_page_content (str): The content of a set.\n",
     "        \n",
     "    Returns:\n",
     "        dict: A dictionary containing the keys \"name\" and \"exercise\".\n",
     "              If parsing fails, returns None.\n",
     "    \"\"\"\n",
     "    # Initialize the output parser with the Tutorial schema.\n",
-    "    parser = PydanticOutputParser(pydantic_object=Tutorial)\n",
+    "    parser = PydanticOutputParser(pydantic_object=Set)\n",
     "\n",
     "    # Construct the prompt, appending the parser's format instructions.\n",
     "    prompt = f\"\"\"\n",
-    "        IMPORTED_TUTORIAL\n",
+    "        IMPORTED_SET\n",
     "        ```markdown\n",
     "        {doc_page_content}\n",
     "        ```\n",
     "\n",
-    "        IMPORTED_TUTORIAL is a tutorial sheet with several exercises. It may or may\n",
-    "        not include reference solutions. Please infer the title of the tutorial from\n",
-    "        the content, if no suitable name found, just leave as Tutorial 0.0, and extract each individual question as a separate string. Do\n",
-    "        not modify the text of the exercises. Only use $...$ for math expressions.\n",
+    "        IMPORTED_SET is a set of questions. It may or may not include reference solutions.\n",
+    "        Infer the title of the set from the content, if no suitable name found, just leave as Unnamed Set, and extract each individual question as a separate string.\n",
+    "        Do not modify the text of the exercises. \n",
+    "        Only use $...$ for math expressions.\n",
     "\n",
-    "        If the exercise mentions figures, then find all the captions of figures (no links). \n",
-    "        Keep the captions as \"Figure Q1 - ...\".\n",
+    "        If the exercise mentions figures/diagrams, then find the diagram (the local path) that it is talking about,\n",
+    "        and include it in the content of the exercise.\n",
     "\n",
     "        If the exercise mentions tables, then include the table in the content.\n",
     "\n",
+    "        Ensure that there is a workedSolution for each exercise, which should have the same title and a list of solutions that matches the subquestions.\n",
+    "\n",
     "        Return a valid JSON string with the following structure:\n",
     "        {parser.get_format_instructions()}\n",
     "        \"\"\"\n",
@@ -384,18 +394,18 @@
     "print(f\"Title: {title}\\n\")\n",
     "\n",
     "# Extract questions\n",
-    "questions = imported_tutorial[\"exercises\"]\n",
+    "questions = imported_tutorial[\"exercise\"]\n",
+    "solutions = imported_tutorial[\"workedSolution\"]\n",
     "\n",
     "# Loop over and print each question\n",
-    "for idx, question in enumerate(questions, start=1):\n",
-    "    print(f\"**Question {idx}**:\\n{question.get(\"title\")}\\n\")\n",
+    "for idx1, (question, solution) in enumerate(zip(questions, solutions), start=1):\n",
+    "    print(f\"**Question {idx1}**:\\n{question.get(\"title\")}\\n\")\n",
     "    print(f\"Content: {question.get(\"content\")}\\n\")\n",
-    "    print(\"Subquestions:\")\n",
-    "    for subquestion in question.get(\"subquestions\", []):\n",
-    "        print(f\"- {subquestion}\")\n",
-    "    print(\"Worked Solution Answers:\")\n",
-    "    for answer in question.get(\"worked_solution_answers\", []):\n",
-    "        print(f\"- {answer}\")\n",
+    "    for idx2, (subquestion, subanswer) in enumerate(zip(question.get(\"subquestions\", []), solution.get(\"workedSolutions\", [])), start=1):\n",
+    "        print(f\"Question {idx1}:\")\n",
+    "        print(f\"- Subquestion {idx2}: {subquestion}\")\n",
+    "        print(f\"- Worked Solution {idx2}: {subanswer}\")\n",
+    "        print(\"\\n\")\n",
     "    print(\"-\" * 40)  # Separator for readability"
    ]
   },
@@ -438,7 +448,7 @@
     "    publish: bool = Field(..., description=\"Publish flag\")\n",
     "    title: str = Field(..., description=\"Question title\")\n",
     "\n",
-    "def create_question_json(question: str) -> dict:\n",
+    "def create_question_json(question: str, solution: str) -> dict:\n",
     "    # Initialize the output parser using the defined Pydantic model.\n",
     "    parser = PydanticOutputParser(pydantic_object=QuestionJson)\n",
     "\n",
@@ -480,6 +490,11 @@
     "      {question}\n",
     "      ```\n",
     "\n",
+    "      IMPORTED_SOLUTION\n",
+    "      ```markdown\n",
+    "      {solution}\n",
+    "      ```\n",
+    "\n",
     "      Preserve the markdown math formatting to use $...$ for math expressions. Do not modify the original text of the question.\n",
     "\n",
     "      Infer the final answer and put it in the answerContent field of the part. \n",
@@ -511,26 +526,28 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "questions = imported_tutorial[\"exercises\"]\n",
+    "questions = imported_tutorial[\"exercise\"]\n",
+    "solutions = imported_tutorial[\"workedSolution\"]\n",
+    "\n",
     "\n",
     "# Loop over all questions and question_answers and print each question\n",
-    "for idx, question in zip(range(1, len(questions)+1), questions):\n",
+    "for idx, (question, solution) in enumerate(zip(questions, solutions), start=1):\n",
     "    print(f\"**Question {idx}**:\\n{question}\\n\")\n",
     "    # print(f\"**Question Answers {idx}**:\\n{question_ans}\\n\")\n",
     "\n",
     "    print(\"INFO: Mapping question in markdown into JSON\")\n",
-    "    question_json = create_question_json(question)\n",
+    "    question_json = create_question_json(question,solution)\n",
     "    question_json[\"orderNumber\"] = idx-1\n",
     "    print(f\"INFO: JSON {idx}:\\n{question_json}\\n\")\n",
     "    \n",
-    "    print(\"INFO: Get figures\")\n",
+    "    # print(\"INFO: Get figures\")\n",
     "    # updated_question_json = add_figure_references_to_questions(figures, question_json)\n",
     "    # updated_question_json = add_local_figures_to_questions(figures, question_json)\n",
     "    updated_question_json = question_json\n",
     "\n",
     "    question_name = updated_question_json[\"title\"].replace(\" \", \"_\")\n",
     "    question_index = f\"{(idx-1):03}\" \n",
-    "    filename = f\"{set_path}question_{question_index}_{question_name}.json\"\n",
+    "    filename = f\"{output_path}/question_{question_index}_{question_name}.json\"\n",
     "    print(f\"INFO: writing {filename}\")\n",
     "    open(filename, \"w\").write(json.dumps(updated_question_json, indent=2))\n",
     "    \n",