From 6035d3d2538cafc3881a7f841f6c37f555443efb Mon Sep 17 00:00:00 2001
From: esteban
Date: Sun, 7 Dec 2025 04:57:38 -0600
Subject: [PATCH 1/3] dataset: synthetic pendulum data

Add PendulumData, a generator built on the Gymnasium pendulum environment.
create_pendulum_data() returns three things per trajectory: the cropped and
resized rendered frames, a (theta, last observation component) pair for every
step, and the pendulum length sampled for that trajectory.

`friction` is optional: when it is missing or falsy, step_env() uses the
frictionless update.

---
 .gitignore                         |   1 +
 pyhealth/datasets/__init__.py      |   1 +
 pyhealth/datasets/pendulum_data.py | 152 +++++++++++++++++++++++++++++
 pyproject.toml                     |   3 +
 tests/core/test_pendulum.py        |  38 ++++++++
 5 files changed, 195 insertions(+)
 create mode 100644 pyhealth/datasets/pendulum_data.py
 create mode 100644 tests/core/test_pendulum.py

diff --git a/.gitignore b/.gitignore
index 4ed3c760b..f315f9501 100644
--- a/.gitignore
+++ b/.gitignore
@@ -134,3 +134,4 @@ leaderboard/rtd_token.txt
 # locally pre-trained models
 pyhealth/medcode/pretrained_embeddings/kg_emb/examples/pretrained_model
 data/physionet.org/
+.vscode
\ No newline at end of file
diff --git a/pyhealth/datasets/__init__.py b/pyhealth/datasets/__init__.py
index 7d6a65f16..2b5589241 100644
--- a/pyhealth/datasets/__init__.py
+++ b/pyhealth/datasets/__init__.py
@@ -61,6 +61,7 @@ def __init__(self, *args, **kwargs):
 from .mimic4 import MIMIC4CXRDataset, MIMIC4Dataset, MIMIC4EHRDataset, MIMIC4NoteDataset
 from .mimicextract import MIMICExtractDataset
 from .omop import OMOPDataset
+from .pendulum_data import PendulumData
 from .sample_dataset import SampleDataset
 from .shhs import SHHSDataset
 from .sleepedf import SleepEDFDataset
diff --git a/pyhealth/datasets/pendulum_data.py b/pyhealth/datasets/pendulum_data.py
new file mode 100644
index 000000000..7fa0a40f1
--- /dev/null
+++ b/pyhealth/datasets/pendulum_data.py
@@ -0,0 +1,152 @@
+# Author(s): Esteban Benitez
+# NetID(s): benitez5
+# Paper Title: Generative ODE Modeling with Known Unknowns
+# Paper Link: https://arxiv.org/abs/2003.10775
+# Description:
+# Implements a Gymnasium-based Pendulum dataset generator for sequence modeling,
+# reproducing the environment, parameterization, and data layout described in the paper.
+
+import numpy as np
+import gymnasium as gym
+import skimage.transform
+from tqdm import trange
+from .base_dataset import BaseDataset
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class PendulumData(BaseDataset):
+    """
+    Synthetic data generator for the OpenAI Gymnasium Pendulum environment.
+
+    This class uses the Pendulum environment from OpenAI's Gymnasium to generate
+    sequences of pendulum states and rendered images. The resulting dataset is suitable
+    for learning and evaluating dynamical system models, such as generative ODEs.
+
+    Parameters
+    ----------
+    env_name : str
+        The name of the gymnasium environment to use (default: 'Pendulum-v1').
+    render_mode : str
+        The render mode for the environment (e.g., "rgb_array").
+    seed : int
+        Random seed for reproducibility.
+    data_size : int
+        Number of trajectories to simulate (number of pendulum sequences).
+    seq_len : int
+        Number of time steps per trajectory.
+    side : int
+        Size (pixels) of each rendered image (output shape: [side, side]).
+    friction : float, optional
+        Friction coefficient used during simulation (if supported by environment).
+
+    References
+    ----------
+    .. [1] Linial, Ori, Neta Ravid, Danny Eytan, and Uri Shalit.
+       "Generative ODE Modeling with Known Unknowns."
+       arXiv preprint arXiv:2003.10775 [cs.LG], 2020. https://arxiv.org/abs/2003.10775
+    .. [2] GOKU GitHub repository: https://github.com/orilinial/GOKU
+
+    Example
+    -------
+    >>> args = {
+    ...     "env_name": "Pendulum-v1",
+    ...     "render_mode": "rgb_array",
+    ...     "seed": 42,
+    ...     "data_size": 100,
+    ...     "seq_len": 50,
+    ...     "side": 64,
+    ...     "friction": 0.0,
+    ... }
+    >>> dataset = PendulumData(**args)
+    """
+
+    def __init__(self, **args):
+        self.env = gym.make(args.get("Pendulum-v1"), render_mode=args["render_mode"]).unwrapped
+        self.env.reset(seed=args.get("seed", 2))
+        self.data = np.zeros(
+            (args["data_size"], args["seq_len"], args["side"], args["side"])
+        )
+        self.latent_data = np.zeros((args["data_size"], args["seq_len"], 2))
+        self.params_data = []
+        self.args = args
+        self.data_size = args["data_size"]
+        self.seq_len = args["seq_len"]
+        self.side = args["side"]
+
+    def create_pendulum_data(self):
+        for trial in trange(self.data_size):
+            reset_env(self.env, self.args)
+            params = get_params()
+            unlearned_params = get_unlearned_params()
+
+            for step in range(self.seq_len):
+                processed_frame = preproc(self.env.render(), self.side)
+                self.data[trial, step] = processed_frame
+                obs = step_env(self.args, self.env, [0.0], params, unlearned_params)
+
+                self.latent_data[trial, step, 0] = get_theta(obs)
+                self.latent_data[trial, step, 1] = obs[-1]
+
+            self.params_data.append(params)
+
+        self.env.close()
+        return self.data, self.latent_data, self.params_data
+
+
+def get_theta(obs):
+    """Transforms coordinate basis from the defaults of the gym pendulum env."""
+    theta = np.arctan2(obs[0], -obs[1])
+    theta = theta + np.pi / 2
+    theta = theta + 2 * np.pi if theta < -np.pi else theta
+    theta = theta - 2 * np.pi if theta > np.pi else theta
+    return theta
+
+
+def preproc(X, side):
+    """Crops, downsamples, desaturates, etc. the rgb pendulum observation."""
+    X = X[..., 0][220:-110, 165:-165] - X[..., 1][220:-110, 165:-165]
+    return skimage.transform.resize(X, [int(side), side]) / 255.0
+
+
+def step_env(args, env, u, params, additional_params):
+    th, thdot = env.state
+
+    g = 10.0
+    m = 1.0
+    b = additional_params["b"]
+    l = params["l"]
+    dt = env.dt
+
+    if args.get("friction"):
+        newthdot = thdot + ((-g / l) * np.sin(th + np.pi) - (b / m) * thdot) * dt
+    else:
+        newthdot = thdot + ((-g / l) * np.sin(th + np.pi)) * dt
+
+    newth = th + newthdot * dt
+    newthdot = np.clip(newthdot, -env.max_speed, env.max_speed)
+
+    env.state = np.array([newth, newthdot])
+    return env._get_obs()
+
+
+def get_params():
+    l = np.random.uniform(1.0, 2.0)
+    params = {"l": l}
+    return params
+
+
+def get_unlearned_params():
+    b = 0.7
+    params = {"b": b}
+    return params
+
+
+def reset_env(env, args, min_angle=0.0, max_angle=np.pi / 6):
+    angle_ok = False
+    while not angle_ok:
+        obs, info = env.reset()
+        theta_init = np.abs(get_theta(obs))
+        if min_angle < theta_init < max_angle:
+            angle_ok = True
diff --git a/pyproject.toml b/pyproject.toml
index 6ab207053..e59d94400 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,6 +39,9 @@ dependencies = [
   "pandas~=2.3.1",
   "pandarallel~=1.6.5",
   "pydantic~=2.11.7",
+  "gymnasium",
+  "scikit-image",
+  "pygame"
 ]
 license = "BSD-3-Clause"
 license-files = ["LICENSE.md"]
diff --git a/tests/core/test_pendulum.py b/tests/core/test_pendulum.py
new file mode 100644
index 000000000..1ff9e8a57
--- /dev/null
+++ b/tests/core/test_pendulum.py
@@ -0,0 +1,38 @@
+import unittest
+from pyhealth.datasets import PendulumData
+
+
+class TestPendulumData(unittest.TestCase):
+    """Test cases for OpenAI Pendulum Dataset."""
+
+    def setUp(self):
+        self.args = {
+            "env_name": "Pendulum-v1",
+            "render_mode": "rgb_array",
+            "seed": 1,
+            "data_size": 10,
+            "seq_len": 2,
+            "side": 1,
+            "friction": 0,
+        }
+
+    def test_get_data(self):
+        pd = PendulumData(**self.args)
+        data, latent_data, params_data = pd.create_pendulum_data()
+        self.assertListEqual(
+            data.tolist(),
+            [
+                [[[0.0]], [[0.0]]],
+                [[[0.0]], [[0.0]]],
+                [[[0.0]], [[0.0]]],
+                [[[0.0]], [[0.0]]],
+                [[[0.0]], [[0.0]]],
+                [[[0.0]], [[0.0]]],
+                [[[0.0]], [[0.0]]],
+                [[[0.0]], [[0.0]]],
+                [[[0.0]], [[0.0]]],
+                [[[0.00021478441553081915]], [[0.00021086596582142346]]],
+            ],
+        )
+        self.assertEqual(latent_data.shape, (10, 2, 2))
+        self.assertGreater(params_data[0]["l"], 0)

From 29699c7f5ac95a2ba6ac2e42d233656fc2c2a7d3 Mon Sep 17 00:00:00 2001
From: esteban
Date: Sun, 7 Dec 2025 05:13:25 -0600
Subject: [PATCH 2/3] fix

PendulumData.__init__ looked up the environment id with
args.get("Pendulum-v1"), i.e. it used the default value as the key, so
gym.make() was handed None. Read the id from the documented `env_name`
argument instead and fall back to "Pendulum-v1" when it is not supplied.

Also add a test that pins the fixture arguments.

---
 pyhealth/datasets/pendulum_data.py |  4 +++-
 tests/core/test_pendulum.py        | 14 ++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/pyhealth/datasets/pendulum_data.py b/pyhealth/datasets/pendulum_data.py
index 7fa0a40f1..e774fd74d 100644
--- a/pyhealth/datasets/pendulum_data.py
+++ b/pyhealth/datasets/pendulum_data.py
@@ -63,7 +63,9 @@ class PendulumData(BaseDataset):
     """
 
     def __init__(self, **args):
-        self.env = gym.make(args.get("Pendulum-v1"), render_mode=args["render_mode"]).unwrapped
+        self.env = gym.make(
+            args.get("env_name", "Pendulum-v1"), render_mode=args["render_mode"]
+        ).unwrapped
         self.env.reset(seed=args.get("seed", 2))
         self.data = np.zeros(
             (args["data_size"], args["seq_len"], args["side"], args["side"])
diff --git a/tests/core/test_pendulum.py b/tests/core/test_pendulum.py
index 1ff9e8a57..b41cd6c6a 100644
--- a/tests/core/test_pendulum.py
+++ b/tests/core/test_pendulum.py
@@ -16,6 +16,20 @@ def setUp(self):
             "friction": 0,
         }
 
+    def test_args(self):
+        self.assertDictEqual(
+            self.args,
+            {
+                "env_name": "Pendulum-v1",
+                "render_mode": "rgb_array",
+                "seed": 1,
+                "data_size": 10,
+                "seq_len": 2,
+                "side": 1,
+                "friction": 0,
+            },
+        )
+
     def test_get_data(self):
         pd = PendulumData(**self.args)
         data, latent_data, params_data = pd.create_pendulum_data()

From d9af1a5967a43c338f7fda5c8fbcd0c47e296494 Mon Sep 17 00:00:00 2001
From: Esteban Benitez
Date: Sun, 7 Dec 2025 16:43:01 -0600
Subject: [PATCH 3/3] Update authors and NetIDs in pendulum_data.py

---
 pyhealth/datasets/pendulum_data.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyhealth/datasets/pendulum_data.py b/pyhealth/datasets/pendulum_data.py
index e774fd74d..e8872f95b 100644
--- a/pyhealth/datasets/pendulum_data.py
+++ b/pyhealth/datasets/pendulum_data.py
@@ -1,5 +1,5 @@
-# Author(s): Esteban Benitez
-# NetID(s): benitez5
+# Author(s): Esteban Benitez, Chloe Yang
+# NetID(s): benitez5, junkey2
 # Paper Title: Generative ODE Modeling with Known Unknowns
 # Paper Link: https://arxiv.org/abs/2003.10775
 # Description: