From e6430b33e0bddccbe594814a7ed0f5529eb27858 Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Mon, 22 Dec 2025 15:45:52 +0000 Subject: [PATCH 01/13] add dataset blending tool --- tools/README_discover_datasets.md | 278 ++++++++++++++ tools/discover_datasets.py | 578 ++++++++++++++++++++++++++++++ 2 files changed, 856 insertions(+) create mode 100644 tools/README_discover_datasets.md create mode 100644 tools/discover_datasets.py diff --git a/tools/README_discover_datasets.md b/tools/README_discover_datasets.md new file mode 100644 index 000000000..eea4b20b5 --- /dev/null +++ b/tools/README_discover_datasets.md @@ -0,0 +1,278 @@ +# Dataset Discovery Tool + +A tool to recursively discover datasets in a directory and generate a concatenated dataset configuration for Fast-LLM. + +## Overview + +This tool walks through a directory tree, identifies datasets by their `fast_llm_config*.yaml` files, and generates a configuration file that concatenates all discovered datasets. + +## Features + +- **Recursive Discovery**: Automatically finds all dataset configs in nested directories +- **Flexible Output**: Can use file references or inline full configs +- **Blended Datasets**: Option to create blended datasets with weights proportional to token counts + +## Usage + +### Command Line + +```bash +python tools/discover_datasets.py -o [options] +``` + +**Arguments:** + +- `directory`: Directory to search for datasets recursively (required) +- `-o, --output`: Output path for the generated config YAML file (required) +- `--no-file-refs`: Inline configs instead of using file references (optional) +- `--blended`: Create blended datasets with weights proportional to token counts (optional) + +**Examples:** + +```bash +# Basic usage - discover all datasets (concatenated) +python tools/discover_datasets.py /path/to/datasets -o combined_dataset.yaml + +# Create blended datasets with token-proportional weights +python tools/discover_datasets.py /path/to/datasets -o blended_dataset.yaml --blended + +# Inline full configs instead of using file references +python tools/discover_datasets.py /path/to/datasets -o combined_dataset.yaml --no-file-refs +``` + +### Config File + +Create a config file: + +```yaml +# discover_config.yaml +directory: /path/to/datasets +output: combined_dataset.yaml +use_file_refs: true +use_blended: false # Set to true for blended datasets with token-proportional weights +``` + +Run with: + +```bash +python tools/discover_datasets.py --config discover_config.yaml +``` + +## Dataset Identification + +The tool identifies datasets by looking for files matching the pattern `fast_llm_config*.yaml`: + +- `fast_llm_config.yaml` - Unsplit dataset +- `fast_llm_config_training.yaml` - Training split +- `fast_llm_config_validation.yaml` - Validation split +- Any other `fast_llm_config_*.yaml` files + +These files are typically generated by the `fast-llm prepare` command during dataset preparation. + +## Output Formats + +### Concatenated Datasets (Default) + +The tool generates a concatenated dataset config that includes all discovered datasets: + +```yaml +type: concatenated +name: my_datasets +datasets: + - type: file + path: /path/to/dataset1/fast_llm_config_training.yaml + - type: file + path: /path/to/dataset1/fast_llm_config_validation.yaml + - type: file + path: /path/to/dataset2/fast_llm_config.yaml +``` + +With concatenated datasets, all datasets are combined sequentially - you'll see all samples from dataset1 first, then all from dataset2, etc. 
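Since the generated file is plain YAML, it can be sanity-checked before training. The sketch below is illustrative only and not part of the tool (it assumes PyYAML and the config structure shown in the examples above; the helper name and output filename are placeholders): it lists every `type: file` reference and, for a config generated with `--blended`, the resulting sampling proportions.

```python
# Minimal sketch: inspect a config produced by discover_datasets.py.
# Assumes PyYAML is installed; names here are illustrative, not part of the tool.
import yaml


def list_file_refs(node: dict) -> list[str]:
    """Recursively collect every `type: file` path in a generated config."""
    if node.get("type") == "file":
        return [node["path"]]
    paths = []
    for child in node.get("datasets", []):
        paths.extend(list_file_refs(child))
    return paths


with open("combined_dataset.yaml") as f:
    config = yaml.safe_load(f)

for path in list_file_refs(config):
    print(path)

# For a config generated with --blended, the top-level weights can be
# normalized into the sampling proportions used during training.
if config.get("type") == "blended":
    total = sum(config["weights"])
    for child, weight in zip(config["datasets"], config["weights"]):
        label = child.get("path", child.get("name"))
        print(f"{weight / total:.1%} of samples drawn from {label}")
```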
+ +### Blended Datasets (with `--blended`) + +With the `--blended` flag, the tool creates a blended dataset config with weights proportional to the number of tokens in each dataset: + +```yaml +type: blended +name: my_datasets +datasets: + - type: file + path: /path/to/dataset1/fast_llm_config_training.yaml + - type: file + path: /path/to/dataset1/fast_llm_config_validation.yaml + - type: file + path: /path/to/dataset2/fast_llm_config.yaml +weights: + - 1500000 # Dataset 1 has 1.5M tokens + - 500000 # Dataset 2 has 500K tokens + - 2000000 # Dataset 3 has 2M tokens +``` + +With blended datasets, samples are drawn from each dataset proportionally to their weights during training. This means: + +- Larger datasets (more tokens) will be sampled more frequently +- Smaller datasets will be sampled less frequently +- The sampling is interleaved, not sequential +- Each dataset maintains its internal order, but samples from different datasets are mixed + +**Hierarchical blending:** When datasets are in nested directories, the tool automatically calculates proper token-proportional weights at all levels. Subdirectories are weighted by their total token count (sum of all datasets within them), ensuring accurate proportional sampling across the entire directory structure. + +**When to use blended vs concatenated:** + +- **Concatenated**: Use when you want to see all data from each dataset sequentially, or when combining dataset splits (train/val). +- **Blended**: Use when you want proportional sampling from multiple data sources during training (e.g., mixing code, books, and web data proportionally). + +### Using in Training Config + +Both formats can be used directly in a training config: + +```yaml +data: + datasets: + training: + type: file + path: combined_dataset.yaml +``` + +## Example Workflow + +### 1. Prepare Multiple Datasets + +```bash +# Prepare dataset 1 +fast-llm prepare --config dataset1_prepare.yaml + +# Prepare dataset 2 +fast-llm prepare --config dataset2_prepare.yaml + +# Prepare dataset 3 +fast-llm prepare --config dataset3_prepare.yaml +``` + +This creates a directory structure like: + +``` +my_datasets/ +├── dataset1/ +│ ├── fast_llm_config_training.yaml +│ ├── fast_llm_config_validation.yaml +│ ├── dataset1_training.fast_llm_dataset +│ └── dataset1_validation.fast_llm_dataset +├── dataset2/ +│ ├── fast_llm_config_training.yaml +│ ├── fast_llm_config_validation.yaml +│ ├── dataset2_training.fast_llm_dataset +│ └── dataset2_validation.fast_llm_dataset +└── dataset3/ + └── experiments/ + ├── fast_llm_config_training.yaml + └── dataset3_training.fast_llm_dataset +``` + +### 2. Discover and Combine Datasets + +```bash +python tools/discover_datasets.py my_datasets/ -o combined_datasets.yaml +``` + +This generates `combined_datasets.yaml`: + +```yaml +type: concatenated +name: my_datasets +datasets: + - type: file + path: my_datasets/dataset1/fast_llm_config_training.yaml + - type: file + path: my_datasets/dataset1/fast_llm_config_validation.yaml + - type: file + path: my_datasets/dataset2/fast_llm_config_training.yaml + - type: file + path: my_datasets/dataset2/fast_llm_config_validation.yaml + - type: file + path: my_datasets/dataset3/experiments/fast_llm_config_training.yaml +``` + +### 3. Use in Training Config + +```yaml +# training_config.yaml +model: + # ... model config ... + +data: + datasets: + training: + type: file + path: combined_datasets.yaml + sampling: + shuffle: skip_first_epoch + seed: 784569 + +# ... rest of training config ... +``` + +### 4. 
Train + +```bash +fast-llm train --config training_config.yaml +``` + +## Use Cases + +### 1. Combining Multiple Data Sources + +You have data from different sources (web scrapes, books, code, etc.) prepared separately: + +```bash +python tools/discover_datasets.py /data/pretraining -o all_pretraining_data.yaml +``` + +### 2. Incremental Data Addition + +You keep adding new datasets over time and want to automatically include all of them: + +```bash +# Just add new prepared datasets to the directory +# Re-run discovery to update the combined config +python tools/discover_datasets.py /data/pretraining -o all_pretraining_data.yaml +``` + +### 3. Experiment Organization + +You have experiments with different preprocessing or filtering: + +``` +experiments/ +├── baseline/ +│ ├── fast_llm_config_training.yaml +│ └── fast_llm_config_validation.yaml +├── filtered_v1/ +│ ├── fast_llm_config_training.yaml +│ └── fast_llm_config_validation.yaml +└── filtered_v2/ + ├── fast_llm_config_training.yaml + └── fast_llm_config_validation.yaml +``` + +```bash +python tools/discover_datasets.py experiments/ -o all_experiments.yaml +``` + +## Notes + +- **File References**: By default, the tool uses `type: file` references which lazily load the actual dataset configs. This keeps the generated config small and readable. + +- **Absolute Paths**: The tool uses absolute paths for file references to ensure configs work regardless of where they're used from. + +- **Ordering**: Datasets are discovered and ordered alphabetically by path for consistency. + +- **Empty Directories**: If no `fast_llm_config*.yaml` files are found, the tool will raise an error. + +- **All Files Included**: The tool concatenates ALL discovered config files. This means if you have both training and validation configs in the same directory, they will all be concatenated together. You may want to organize your directory structure accordingly or filter the specific configs you need after generation. + +## See Also + +- [Fast-LLM Data Configuration Documentation](../docs/recipes/data-configuration.md) +- [Dataset Preparation Guide](../docs/recipes/data-preparation.md) diff --git a/tools/discover_datasets.py b/tools/discover_datasets.py new file mode 100644 index 000000000..630e45fb4 --- /dev/null +++ b/tools/discover_datasets.py @@ -0,0 +1,578 @@ +""" +Tool to recursively discover datasets in a directory and generate a concatenated dataset config. + +This tool walks through a directory tree, identifies datasets by their fast_llm_config*.yaml files, +and generates a config file that concatenates all discovered datasets. +""" + +import argparse +import logging +import pathlib + +import yaml + +from fast_llm.config import Field, config_class +from fast_llm.engine.config_utils.runnable import RunnableConfig + +logger = logging.getLogger(__name__) + + +def find_dataset_configs(root_dir: pathlib.Path, ignore_paths: list[pathlib.Path] | None = None) -> list[pathlib.Path]: + """ + Recursively find all fast_llm_config*.yaml files in the directory tree. 
+ + Args: + root_dir: Root directory to search + ignore_paths: List of paths to ignore (can be absolute or relative to root_dir) + + Returns: + List of paths to fast_llm_config*.yaml files + """ + config_files = [] + + # Normalize ignore paths to absolute paths + ignore_paths_absolute = set() + if ignore_paths: + for ignore_path in ignore_paths: + if ignore_path.is_absolute(): + ignore_paths_absolute.add(ignore_path.resolve()) + else: + ignore_paths_absolute.add((root_dir / ignore_path).resolve()) + + # Find all fast_llm_config*.yaml files + for config_file in root_dir.rglob("fast_llm_config*.yaml"): + # Check if this file should be ignored + config_file_resolved = config_file.resolve() + should_ignore = False + + for ignore_path in ignore_paths_absolute: + # Check if the config file is under an ignored path + try: + config_file_resolved.relative_to(ignore_path) + should_ignore = True + break + except ValueError: + # Not under this ignored path, continue checking + continue + + if not should_ignore: + config_files.append(config_file) + + # Sort by path for consistent ordering + config_files.sort() + + return config_files + + +def load_dataset_config(config_path: pathlib.Path) -> dict: + """ + Load a dataset config from a YAML file. + + Args: + config_path: Path to the config file + + Returns: + The loaded config as a dictionary + """ + with open(config_path) as f: + config = yaml.safe_load(f) + return config + + +def _read_memmap_num_tokens(memmap_path: pathlib.Path) -> int: + """Read number of tokens from a memmap file.""" + import json + + from fast_llm.data.dataset.memmap import FILE_HEADER + from fast_llm.data.sample.abstract import MemmapIndexDatasetReaderConfig + + if not memmap_path.exists(): + logger.warning(f"Memmap file not found: {memmap_path}") + return 0 + + try: + with memmap_path.open("rb") as stream: + header = stream.read(len(FILE_HEADER)) + if header != FILE_HEADER: + logger.warning(f"Invalid memmap file format: {memmap_path}") + return 0 + stream.seek(int.from_bytes(stream.read(8), signed=False)) + config_bytes = stream.read(int.from_bytes(stream.read(4), signed=False)) + reader_config = MemmapIndexDatasetReaderConfig.from_dict(json.loads(config_bytes.decode("utf-8"))) + return reader_config.num_tokens + except Exception as e: + logger.warning(f"Failed to read memmap file {memmap_path}: {e}") + return 0 + + +def _resolve_path(path: str | pathlib.Path, relative_to: pathlib.Path) -> pathlib.Path: + """Resolve a path relative to a base directory if not absolute.""" + path = pathlib.Path(path) + return path if path.is_absolute() else relative_to / path + + +def _get_config_num_tokens(config_dict: dict, base_dir: pathlib.Path) -> int: + """Get number of tokens from a config dict (handles inline configs recursively).""" + dataset_type = config_dict.get("type") + + if dataset_type == "file": + file_path = _resolve_path(config_dict["path"], base_dir) + return get_dataset_num_tokens(file_path) + + if dataset_type == "memmap": + memmap_path = _resolve_path(config_dict.get("path", ""), base_dir) + return _read_memmap_num_tokens(memmap_path) + + if dataset_type in ["blended", "sampled", "concatenated"]: + return sum(_get_config_num_tokens(sub, base_dir) for sub in config_dict.get("datasets", [])) + + if dataset_type == "slice": + base_config = config_dict.get("dataset", {}) + begin = config_dict.get("begin", 0) + end = config_dict.get("end", 1) + base_tokens = _get_config_num_tokens(base_config, base_dir) + return int(base_tokens * (end - begin)) + + logger.warning(f"Unsupported 
inline config type '{dataset_type}'") + return 0 + + +def get_dataset_num_tokens(config_path: pathlib.Path) -> int: + """ + Load a dataset config and get its number of tokens. + + Args: + config_path: Path to the dataset config file + + Returns: + Number of tokens in the dataset + """ + # Import preprocessing and sample configs to register them + import fast_llm.data.preprocessing.image_patch # noqa + import fast_llm.data.preprocessing.language_model # noqa + import fast_llm.data.sample.language_model # noqa + import fast_llm.data.sample.patch # noqa + import fast_llm.data.sample.range # noqa + import fast_llm.data.sample.token # noqa + + config_dict = load_dataset_config(config_path) + return _get_config_num_tokens(config_dict, config_path.parent) + + +def create_concatenated_config( + config_files: list[pathlib.Path], + name: str = "concatenated", + use_file_refs: bool = True, + use_blended: bool = False, +) -> dict: + """ + Create a concatenated or blended dataset config from a list of config files. + + Args: + config_files: List of paths to dataset config files + name: Name for the concatenated/blended dataset + use_file_refs: If True, use file references (type: file, path: ...). + If False, inline the full configs. + use_blended: If True, create a blended dataset with weights proportional to token counts. + If False, create a concatenated dataset. + + Returns: + Dictionary representing a concatenated or blended dataset config + """ + if len(config_files) == 0: + raise ValueError("No config files provided") + + if len(config_files) == 1: + # If only one dataset, just reference it directly + if use_file_refs: + return { + "type": "file", + "path": str(config_files[0]), + } + else: + return load_dataset_config(config_files[0]) + + # Multiple datasets + datasets = [] + for config_file in config_files: + if use_file_refs: + datasets.append( + { + "type": "file", + "path": str(config_file), + } + ) + else: + datasets.append(load_dataset_config(config_file)) + + if use_blended: + # Get token counts for each dataset to calculate weights + logger.info("Calculating token counts for blended dataset weights...") + weights = [] + for config_file in config_files: + try: + num_tokens = get_dataset_num_tokens(config_file) + weights.append(num_tokens / 1e9) + logger.info(f" - {config_file.name}: {num_tokens:,} tokens") + except Exception as e: + logger.error(f"Failed to get token count for {config_file}: {e}") + # Use weight of 1 as fallback + weights.append(1) + logger.warning(f" - {config_file.name}: using fallback weight of 1") + + return { + "type": "blended", + "name": name, + "datasets": datasets, + "weights": weights, + } + else: + return { + "type": "concatenated", + "name": name, + "datasets": datasets, + } + + +def group_configs_by_directory( + config_files: list[pathlib.Path], root_dir: pathlib.Path +) -> dict[pathlib.Path, list[pathlib.Path]]: + """ + Group config files by their parent directory. 
+ + Args: + config_files: List of config file paths + root_dir: Root directory to use for relative paths + + Returns: + Dictionary mapping directory paths to lists of config files in that directory + """ + groups: dict[pathlib.Path, list[pathlib.Path]] = {} + + for config_file in config_files: + parent_dir = config_file.parent + if parent_dir not in groups: + groups[parent_dir] = [] + groups[parent_dir].append(config_file) + + return groups + + +def build_directory_tree( + groups: dict[pathlib.Path, list[pathlib.Path]], root_dir: pathlib.Path +) -> dict[pathlib.Path, set[pathlib.Path]]: + """ + Build a tree structure of directories showing parent-child relationships. + + Args: + groups: Dictionary mapping directories to their config files + root_dir: Root directory + + Returns: + Dictionary mapping each directory to its immediate child directories + """ + tree: dict[pathlib.Path, set[pathlib.Path]] = {root_dir: set()} + + for directory in groups.keys(): + # Add all ancestors to the tree + current = directory + while current != root_dir and current.parent != current: + parent = current.parent + if parent not in tree: + tree[parent] = set() + if current not in tree: + tree[current] = set() + tree[parent].add(current) + current = parent + + return tree + + +def create_directory_config( + directory: pathlib.Path, + groups: dict[pathlib.Path, list[pathlib.Path]], + tree: dict[pathlib.Path, set[pathlib.Path]], + root_dir: pathlib.Path, + use_file_refs: bool, + use_blended: bool = False, +) -> tuple[dict, int] | None: + """ + Recursively create a config for a directory and its subdirectories. + + Args: + directory: Current directory to process + groups: Dictionary mapping directories to their config files + tree: Directory tree structure + root_dir: Root directory + use_file_refs: Whether to use file references + use_blended: Whether to create blended datasets instead of concatenated + + Returns: + Tuple of (config dictionary, total token count), or None if directory has no datasets + """ + local_datasets = [] + local_config_files = [] + local_tokens = [] + subdir_datasets = [] + subdir_tokens = [] + + # First, collect configs directly in this directory (not in subdirectories) + if directory in groups: + for config_file in sorted(groups[directory]): + if use_file_refs: + local_datasets.append( + { + "type": "file", + "path": str(config_file), + } + ) + else: + local_datasets.append(load_dataset_config(config_file)) + local_config_files.append(config_file) + + # Get token count for this dataset + try: + num_tokens = get_dataset_num_tokens(config_file) + local_tokens.append(num_tokens / 1e9) + except Exception as e: + logger.warning(f"Failed to get token count for {config_file}: {e}") + local_tokens.append(1) + + # Then, recursively process subdirectories + if directory in tree: + for subdir in sorted(tree[directory]): + subdir_result = create_directory_config(subdir, groups, tree, root_dir, use_file_refs, use_blended) + if subdir_result is not None: + subdir_config, subdir_token_count = subdir_result + subdir_datasets.append(subdir_config) + subdir_tokens.append(subdir_token_count) + + # If we have both local datasets and subdirectory datasets, group local ones first + if local_datasets and subdir_datasets: + # Create a group for local datasets if there are multiple + if len(local_datasets) > 1: + rel_path = directory.relative_to(root_dir) if directory != root_dir else pathlib.Path(".") + local_name = f"{str(rel_path).replace('/', '_').replace('.', root_dir.name)}_local" + local_total_tokens = 
sum(local_tokens) + + if use_blended: + local_group = { + "type": "blended", + "name": local_name, + "datasets": local_datasets, + "weights": local_tokens, + } + else: + local_group = { + "type": "concatenated", + "name": local_name, + "datasets": local_datasets, + } + all_datasets = [local_group] + subdir_datasets + all_tokens = [local_total_tokens] + subdir_tokens + else: + all_datasets = local_datasets + subdir_datasets + all_tokens = local_tokens + subdir_tokens + elif local_datasets: + all_datasets = local_datasets + all_tokens = local_tokens + elif subdir_datasets: + all_datasets = subdir_datasets + all_tokens = subdir_tokens + else: + return None + + # Calculate total tokens for this directory + total_tokens = sum(all_tokens) + + if len(all_datasets) == 1: + # Don't wrap a single dataset + return all_datasets[0], total_tokens + + # Multiple datasets - create concatenated or blended config + rel_path = directory.relative_to(root_dir) if directory != root_dir else pathlib.Path(".") + name = str(rel_path).replace("/", "_").replace(".", root_dir.name) + + if use_blended: + # Use the collected token counts as weights + return { + "type": "blended", + "name": name, + "datasets": all_datasets, + "weights": all_tokens, + }, total_tokens + else: + return { + "type": "concatenated", + "name": name, + "datasets": all_datasets, + }, total_tokens + + +def create_hierarchical_config( + root_dir: pathlib.Path, + use_file_refs: bool = True, + use_blended: bool = False, + ignore_paths: list[pathlib.Path] | None = None, +) -> dict: + """ + Create a hierarchical concatenated or blended dataset config from all datasets in a directory. + + Datasets in the same directory are grouped together, and these groups are nested + following the directory structure. + + Args: + root_dir: Root directory to search for datasets + use_file_refs: If True, use file references (type: file). + If False, inline the full configs. + use_blended: If True, create blended datasets with weights proportional to token counts. + If False, create concatenated datasets. + ignore_paths: List of paths to ignore (can be absolute or relative to root_dir) + + Returns: + Dictionary representing the hierarchical dataset config + """ + logger.info(f"Discovering datasets in {root_dir}...") + + if ignore_paths: + logger.info(f"Ignoring {len(ignore_paths)} path(s):") + for ignore_path in ignore_paths: + logger.info(f" - {ignore_path}") + + config_files = find_dataset_configs(root_dir, ignore_paths=ignore_paths) + + if not config_files: + raise ValueError(f"No fast_llm_config*.yaml files found in {root_dir}") + + logger.info(f"Found {len(config_files)} dataset config(s):") + for config_file in config_files: + logger.info(f" - {config_file.relative_to(root_dir)}") + + # Group configs by directory + groups = group_configs_by_directory(config_files, root_dir) + + # Build directory tree + tree = build_directory_tree(groups, root_dir) + + # Create hierarchical config + result = create_directory_config(root_dir, groups, tree, root_dir, use_file_refs, use_blended) + + if result is None: + raise ValueError("Failed to create config") + + config, total_tokens = result + + if use_blended: + logger.info(f"Total tokens across all datasets: {total_tokens:,}") + + return config + + +@config_class() +class DiscoverDatasetsConfig(RunnableConfig): + """ + Configuration for the dataset discovery tool. 
+ """ + + directory: pathlib.Path = Field(desc="Directory to search for datasets recursively") + output: pathlib.Path = Field(desc="Output path for the generated config YAML file") + use_file_refs: bool = Field(default=True, desc="Use file references (type: file) instead of inlining configs") + use_blended: bool = Field( + default=False, + desc="Create blended datasets with weights proportional to token counts instead of concatenated datasets", + ) + ignore_paths: list[pathlib.Path] = Field( + default_factory=list, + desc="List of paths to ignore during dataset discovery (can be absolute or relative to directory)", + ) + + def run(self): + """ + Run the dataset discovery tool. + """ + # Validate directory exists + if not self.directory.exists(): + raise ValueError(f"Directory does not exist: {self.directory}") + + if not self.directory.is_dir(): + raise ValueError(f"Path is not a directory: {self.directory}") + + # Generate the hierarchical config + config = create_hierarchical_config( + self.directory.resolve(), + use_file_refs=self.use_file_refs, + use_blended=self.use_blended, + ignore_paths=self.ignore_paths, + ) + + # Write the config to the output file + self.output.parent.mkdir(parents=True, exist_ok=True) + with open(self.output, "w") as f: + yaml.safe_dump(config, f, default_flow_style=False, sort_keys=False) + + logger.info(f"Generated dataset config saved to {self.output}") + + # Print a preview of the config + logger.info("\nGenerated config preview:") + preview = yaml.safe_dump(config, default_flow_style=False, sort_keys=False) + for line in preview.split("\n")[:50]: # Show first 50 lines + logger.info(line) + + if len(preview.split("\n")) > 50: + logger.info("... (truncated)") + + +def main(): + """ + Command-line entry point. + """ + parser = argparse.ArgumentParser( + description="Discover datasets and generate hierarchical concatenated or blended config" + ) + parser.add_argument("directory", type=pathlib.Path, help="Directory to search for datasets recursively") + parser.add_argument( + "-o", "--output", type=pathlib.Path, required=True, help="Output path for the generated config YAML file" + ) + parser.add_argument("--no-file-refs", action="store_true", help="Inline configs instead of using file references") + parser.add_argument( + "--blended", + action="store_true", + help="Create blended datasets with weights proportional to token counts instead of concatenated datasets", + ) + parser.add_argument( + "--ignore", + type=pathlib.Path, + action="append", + dest="ignore_paths", + help="Path to ignore during dataset discovery (can be specified multiple times)", + ) + + args = parser.parse_args() + + # Configure logging + logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") + + # Require --blended flag since concatenated datasets don't work properly + if not args.blended: + parser.error("--blended flag is required (concatenated datasets are currently not supported)") + + # Create and run the config + config = DiscoverDatasetsConfig( + directory=args.directory, + output=args.output, + use_file_refs=not args.no_file_refs, + use_blended=args.blended, + ignore_paths=args.ignore_paths or [], + ) + config.run() + + +if __name__ == "__main__": + # Support both CLI usage and Fast-LLM's config system + import sys + + # Check if using argparse-style CLI (positional arg without --config) + if len(sys.argv) > 1 and not sys.argv[1].startswith("-") and sys.argv[1] != "--config": + main() + else: + DiscoverDatasetsConfig.parse_and_run() From 
b2cb71036fa1e8d591df582832e771597d4a3623 Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Mon, 22 Dec 2025 15:56:35 +0000 Subject: [PATCH 02/13] only blended. concatenated doesnt work --- tools/README_discover_datasets.md | 77 +++++++--------- tools/discover_datasets.py | 147 ++++++++++-------------------- 2 files changed, 83 insertions(+), 141 deletions(-) diff --git a/tools/README_discover_datasets.md b/tools/README_discover_datasets.md index eea4b20b5..2959dc33d 100644 --- a/tools/README_discover_datasets.md +++ b/tools/README_discover_datasets.md @@ -1,16 +1,16 @@ # Dataset Discovery Tool -A tool to recursively discover datasets in a directory and generate a concatenated dataset configuration for Fast-LLM. +A tool to recursively discover datasets in a directory and generate a blended dataset configuration for Fast-LLM. ## Overview -This tool walks through a directory tree, identifies datasets by their `fast_llm_config*.yaml` files, and generates a configuration file that concatenates all discovered datasets. +This tool walks through a directory tree, identifies datasets by their `fast_llm_config*.yaml` files, and generates a configuration file that blends all discovered datasets with weights proportional to token counts. ## Features - **Recursive Discovery**: Automatically finds all dataset configs in nested directories - **Flexible Output**: Can use file references or inline full configs -- **Blended Datasets**: Option to create blended datasets with weights proportional to token counts +- **Token-Proportional Blending**: Automatically calculates weights based on dataset token counts for proportional sampling ## Usage @@ -25,19 +25,19 @@ python tools/discover_datasets.py -o [options] - `directory`: Directory to search for datasets recursively (required) - `-o, --output`: Output path for the generated config YAML file (required) - `--no-file-refs`: Inline configs instead of using file references (optional) -- `--blended`: Create blended datasets with weights proportional to token counts (optional) +- `--ignore`: Path to ignore during dataset discovery (can be specified multiple times, optional) **Examples:** ```bash -# Basic usage - discover all datasets (concatenated) -python tools/discover_datasets.py /path/to/datasets -o combined_dataset.yaml - -# Create blended datasets with token-proportional weights -python tools/discover_datasets.py /path/to/datasets -o blended_dataset.yaml --blended +# Basic usage - discover all datasets and create blended config +python tools/discover_datasets.py /path/to/datasets -o blended_dataset.yaml # Inline full configs instead of using file references -python tools/discover_datasets.py /path/to/datasets -o combined_dataset.yaml --no-file-refs +python tools/discover_datasets.py /path/to/datasets -o blended_dataset.yaml --no-file-refs + +# Ignore specific paths during discovery +python tools/discover_datasets.py /path/to/datasets -o blended_dataset.yaml --ignore experiments/old --ignore tmp ``` ### Config File @@ -47,9 +47,9 @@ Create a config file: ```yaml # discover_config.yaml directory: /path/to/datasets -output: combined_dataset.yaml +output: blended_dataset.yaml use_file_refs: true -use_blended: false # Set to true for blended datasets with token-proportional weights +ignore_paths: [] # Optional list of paths to ignore ``` Run with: @@ -69,29 +69,11 @@ The tool identifies datasets by looking for files matching the pattern `fast_llm These files are typically generated by the `fast-llm prepare` command during dataset preparation. 
-## Output Formats - -### Concatenated Datasets (Default) - -The tool generates a concatenated dataset config that includes all discovered datasets: - -```yaml -type: concatenated -name: my_datasets -datasets: - - type: file - path: /path/to/dataset1/fast_llm_config_training.yaml - - type: file - path: /path/to/dataset1/fast_llm_config_validation.yaml - - type: file - path: /path/to/dataset2/fast_llm_config.yaml -``` - -With concatenated datasets, all datasets are combined sequentially - you'll see all samples from dataset1 first, then all from dataset2, etc. +## Output Format -### Blended Datasets (with `--blended`) +### Blended Datasets -With the `--blended` flag, the tool creates a blended dataset config with weights proportional to the number of tokens in each dataset: +The tool generates a blended dataset config with weights proportional to the number of tokens in each dataset: ```yaml type: blended @@ -118,21 +100,22 @@ With blended datasets, samples are drawn from each dataset proportionally to the **Hierarchical blending:** When datasets are in nested directories, the tool automatically calculates proper token-proportional weights at all levels. Subdirectories are weighted by their total token count (sum of all datasets within them), ensuring accurate proportional sampling across the entire directory structure. -**When to use blended vs concatenated:** +**Benefits of blended datasets:** -- **Concatenated**: Use when you want to see all data from each dataset sequentially, or when combining dataset splits (train/val). -- **Blended**: Use when you want proportional sampling from multiple data sources during training (e.g., mixing code, books, and web data proportionally). +- **Proportional sampling**: Each dataset is sampled proportionally to its size, preventing smaller datasets from being underrepresented +- **Interleaved samples**: Unlike sequential concatenation, samples from different datasets are mixed during training +- **Automatic weight calculation**: No need to manually specify weights - they're calculated from token counts ### Using in Training Config -Both formats can be used directly in a training config: +The generated config can be used directly in a training config: ```yaml data: datasets: training: type: file - path: combined_dataset.yaml + path: blended_dataset.yaml ``` ## Example Workflow @@ -170,16 +153,16 @@ my_datasets/ └── dataset3_training.fast_llm_dataset ``` -### 2. Discover and Combine Datasets +### 2. Discover and Blend Datasets ```bash -python tools/discover_datasets.py my_datasets/ -o combined_datasets.yaml +python tools/discover_datasets.py my_datasets/ -o blended_datasets.yaml ``` -This generates `combined_datasets.yaml`: +This generates `blended_datasets.yaml`: ```yaml -type: concatenated +type: blended name: my_datasets datasets: - type: file @@ -192,6 +175,12 @@ datasets: path: my_datasets/dataset2/fast_llm_config_validation.yaml - type: file path: my_datasets/dataset3/experiments/fast_llm_config_training.yaml +weights: + - 1500.0 # Dataset 1 training: 1.5B tokens + - 500.0 # Dataset 1 validation: 500M tokens + - 2000.0 # Dataset 2 training: 2B tokens + - 800.0 # Dataset 2 validation: 800M tokens + - 3000.0 # Dataset 3 training: 3B tokens ``` ### 3. 
Use in Training Config @@ -205,7 +194,7 @@ data: datasets: training: type: file - path: combined_datasets.yaml + path: blended_datasets.yaml sampling: shuffle: skip_first_epoch seed: 784569 @@ -270,7 +259,7 @@ python tools/discover_datasets.py experiments/ -o all_experiments.yaml - **Empty Directories**: If no `fast_llm_config*.yaml` files are found, the tool will raise an error. -- **All Files Included**: The tool concatenates ALL discovered config files. This means if you have both training and validation configs in the same directory, they will all be concatenated together. You may want to organize your directory structure accordingly or filter the specific configs you need after generation. +- **All Files Included**: The tool blends ALL discovered config files with weights proportional to their token counts. This means if you have both training and validation configs in the same directory, they will all be included in the blended dataset. You may want to organize your directory structure accordingly or use the `--ignore` flag to exclude specific paths. ## See Also diff --git a/tools/discover_datasets.py b/tools/discover_datasets.py index 630e45fb4..ddc6e3093 100644 --- a/tools/discover_datasets.py +++ b/tools/discover_datasets.py @@ -1,8 +1,8 @@ """ -Tool to recursively discover datasets in a directory and generate a concatenated dataset config. +Tool to recursively discover datasets in a directory and generate a blended dataset config. This tool walks through a directory tree, identifies datasets by their fast_llm_config*.yaml files, -and generates a config file that concatenates all discovered datasets. +and generates a config file that blends all discovered datasets with weights proportional to token counts. """ import argparse @@ -159,25 +159,22 @@ def get_dataset_num_tokens(config_path: pathlib.Path) -> int: return _get_config_num_tokens(config_dict, config_path.parent) -def create_concatenated_config( +def create_blended_config( config_files: list[pathlib.Path], - name: str = "concatenated", + name: str = "blended", use_file_refs: bool = True, - use_blended: bool = False, ) -> dict: """ - Create a concatenated or blended dataset config from a list of config files. + Create a blended dataset config from a list of config files. Args: config_files: List of paths to dataset config files - name: Name for the concatenated/blended dataset + name: Name for the blended dataset use_file_refs: If True, use file references (type: file, path: ...). If False, inline the full configs. - use_blended: If True, create a blended dataset with weights proportional to token counts. - If False, create a concatenated dataset. 
Returns: - Dictionary representing a concatenated or blended dataset config + Dictionary representing a blended dataset config """ if len(config_files) == 0: raise ValueError("No config files provided") @@ -205,33 +202,26 @@ def create_concatenated_config( else: datasets.append(load_dataset_config(config_file)) - if use_blended: - # Get token counts for each dataset to calculate weights - logger.info("Calculating token counts for blended dataset weights...") - weights = [] - for config_file in config_files: - try: - num_tokens = get_dataset_num_tokens(config_file) - weights.append(num_tokens / 1e9) - logger.info(f" - {config_file.name}: {num_tokens:,} tokens") - except Exception as e: - logger.error(f"Failed to get token count for {config_file}: {e}") - # Use weight of 1 as fallback - weights.append(1) - logger.warning(f" - {config_file.name}: using fallback weight of 1") - - return { - "type": "blended", - "name": name, - "datasets": datasets, - "weights": weights, - } - else: - return { - "type": "concatenated", - "name": name, - "datasets": datasets, - } + # Get token counts for each dataset to calculate weights + logger.info("Calculating token counts for blended dataset weights...") + weights = [] + for config_file in config_files: + try: + num_tokens = get_dataset_num_tokens(config_file) + weights.append(num_tokens / 1e9) + logger.info(f" - {config_file.name}: {num_tokens:,} tokens") + except Exception as e: + logger.error(f"Failed to get token count for {config_file}: {e}") + # Use weight of 1 as fallback + weights.append(1) + logger.warning(f" - {config_file.name}: using fallback weight of 1") + + return { + "type": "blended", + "name": name, + "datasets": datasets, + "weights": weights, + } def group_configs_by_directory( @@ -294,10 +284,9 @@ def create_directory_config( tree: dict[pathlib.Path, set[pathlib.Path]], root_dir: pathlib.Path, use_file_refs: bool, - use_blended: bool = False, ) -> tuple[dict, int] | None: """ - Recursively create a config for a directory and its subdirectories. + Recursively create a blended config for a directory and its subdirectories. 
Args: directory: Current directory to process @@ -305,7 +294,6 @@ def create_directory_config( tree: Directory tree structure root_dir: Root directory use_file_refs: Whether to use file references - use_blended: Whether to create blended datasets instead of concatenated Returns: Tuple of (config dictionary, total token count), or None if directory has no datasets @@ -341,7 +329,7 @@ def create_directory_config( # Then, recursively process subdirectories if directory in tree: for subdir in sorted(tree[directory]): - subdir_result = create_directory_config(subdir, groups, tree, root_dir, use_file_refs, use_blended) + subdir_result = create_directory_config(subdir, groups, tree, root_dir, use_file_refs) if subdir_result is not None: subdir_config, subdir_token_count = subdir_result subdir_datasets.append(subdir_config) @@ -355,19 +343,12 @@ def create_directory_config( local_name = f"{str(rel_path).replace('/', '_').replace('.', root_dir.name)}_local" local_total_tokens = sum(local_tokens) - if use_blended: - local_group = { - "type": "blended", - "name": local_name, - "datasets": local_datasets, - "weights": local_tokens, - } - else: - local_group = { - "type": "concatenated", - "name": local_name, - "datasets": local_datasets, - } + local_group = { + "type": "blended", + "name": local_name, + "datasets": local_datasets, + "weights": local_tokens, + } all_datasets = [local_group] + subdir_datasets all_tokens = [local_total_tokens] + subdir_tokens else: @@ -389,48 +370,38 @@ def create_directory_config( # Don't wrap a single dataset return all_datasets[0], total_tokens - # Multiple datasets - create concatenated or blended config + # Multiple datasets - create blended config rel_path = directory.relative_to(root_dir) if directory != root_dir else pathlib.Path(".") name = str(rel_path).replace("/", "_").replace(".", root_dir.name) - if use_blended: - # Use the collected token counts as weights - return { - "type": "blended", - "name": name, - "datasets": all_datasets, - "weights": all_tokens, - }, total_tokens - else: - return { - "type": "concatenated", - "name": name, - "datasets": all_datasets, - }, total_tokens + # Use the collected token counts as weights + return { + "type": "blended", + "name": name, + "datasets": all_datasets, + "weights": all_tokens, + }, total_tokens def create_hierarchical_config( root_dir: pathlib.Path, use_file_refs: bool = True, - use_blended: bool = False, ignore_paths: list[pathlib.Path] | None = None, ) -> dict: """ - Create a hierarchical concatenated or blended dataset config from all datasets in a directory. + Create a hierarchical blended dataset config from all datasets in a directory. - Datasets in the same directory are grouped together, and these groups are nested - following the directory structure. + Datasets in the same directory are grouped together with weights proportional to token counts, + and these groups are nested following the directory structure. Args: root_dir: Root directory to search for datasets use_file_refs: If True, use file references (type: file). If False, inline the full configs. - use_blended: If True, create blended datasets with weights proportional to token counts. - If False, create concatenated datasets. 
ignore_paths: List of paths to ignore (can be absolute or relative to root_dir) Returns: - Dictionary representing the hierarchical dataset config + Dictionary representing the hierarchical blended dataset config """ logger.info(f"Discovering datasets in {root_dir}...") @@ -455,15 +426,14 @@ def create_hierarchical_config( tree = build_directory_tree(groups, root_dir) # Create hierarchical config - result = create_directory_config(root_dir, groups, tree, root_dir, use_file_refs, use_blended) + result = create_directory_config(root_dir, groups, tree, root_dir, use_file_refs) if result is None: raise ValueError("Failed to create config") config, total_tokens = result - if use_blended: - logger.info(f"Total tokens across all datasets: {total_tokens:,}") + logger.info(f"Total tokens across all datasets: {total_tokens:,}") return config @@ -477,10 +447,6 @@ class DiscoverDatasetsConfig(RunnableConfig): directory: pathlib.Path = Field(desc="Directory to search for datasets recursively") output: pathlib.Path = Field(desc="Output path for the generated config YAML file") use_file_refs: bool = Field(default=True, desc="Use file references (type: file) instead of inlining configs") - use_blended: bool = Field( - default=False, - desc="Create blended datasets with weights proportional to token counts instead of concatenated datasets", - ) ignore_paths: list[pathlib.Path] = Field( default_factory=list, desc="List of paths to ignore during dataset discovery (can be absolute or relative to directory)", @@ -501,7 +467,6 @@ def run(self): config = create_hierarchical_config( self.directory.resolve(), use_file_refs=self.use_file_refs, - use_blended=self.use_blended, ignore_paths=self.ignore_paths, ) @@ -526,19 +491,12 @@ def main(): """ Command-line entry point. """ - parser = argparse.ArgumentParser( - description="Discover datasets and generate hierarchical concatenated or blended config" - ) + parser = argparse.ArgumentParser(description="Discover datasets and generate hierarchical blended config") parser.add_argument("directory", type=pathlib.Path, help="Directory to search for datasets recursively") parser.add_argument( "-o", "--output", type=pathlib.Path, required=True, help="Output path for the generated config YAML file" ) parser.add_argument("--no-file-refs", action="store_true", help="Inline configs instead of using file references") - parser.add_argument( - "--blended", - action="store_true", - help="Create blended datasets with weights proportional to token counts instead of concatenated datasets", - ) parser.add_argument( "--ignore", type=pathlib.Path, @@ -552,16 +510,11 @@ def main(): # Configure logging logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") - # Require --blended flag since concatenated datasets don't work properly - if not args.blended: - parser.error("--blended flag is required (concatenated datasets are currently not supported)") - # Create and run the config config = DiscoverDatasetsConfig( directory=args.directory, output=args.output, use_file_refs=not args.no_file_refs, - use_blended=args.blended, ignore_paths=args.ignore_paths or [], ) config.run() From 05d3bdd5747f9142f5eea174b32f15bf141adae8 Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Mon, 22 Dec 2025 16:13:56 +0000 Subject: [PATCH 03/13] refactor --- tools/discover_datasets.py | 172 ++++++++++++++++++------------------- 1 file changed, 86 insertions(+), 86 deletions(-) diff --git a/tools/discover_datasets.py b/tools/discover_datasets.py index ddc6e3093..776734299 100644 --- 
a/tools/discover_datasets.py +++ b/tools/discover_datasets.py @@ -8,6 +8,7 @@ import argparse import logging import pathlib +from collections import defaultdict import yaml @@ -17,6 +18,15 @@ logger = logging.getLogger(__name__) +def _is_subpath(path: pathlib.Path, parent: pathlib.Path) -> bool: + """Check if path is under parent directory.""" + try: + path.relative_to(parent) + return True + except ValueError: + return False + + def find_dataset_configs(root_dir: pathlib.Path, ignore_paths: list[pathlib.Path] | None = None) -> list[pathlib.Path]: """ Recursively find all fast_llm_config*.yaml files in the directory tree. @@ -28,8 +38,6 @@ def find_dataset_configs(root_dir: pathlib.Path, ignore_paths: list[pathlib.Path Returns: List of paths to fast_llm_config*.yaml files """ - config_files = [] - # Normalize ignore paths to absolute paths ignore_paths_absolute = set() if ignore_paths: @@ -39,29 +47,19 @@ def find_dataset_configs(root_dir: pathlib.Path, ignore_paths: list[pathlib.Path else: ignore_paths_absolute.add((root_dir / ignore_path).resolve()) - # Find all fast_llm_config*.yaml files + # Find all fast_llm_config*.yaml files and filter out ignored ones + config_files = [] for config_file in root_dir.rglob("fast_llm_config*.yaml"): - # Check if this file should be ignored config_file_resolved = config_file.resolve() - should_ignore = False - - for ignore_path in ignore_paths_absolute: - # Check if the config file is under an ignored path - try: - config_file_resolved.relative_to(ignore_path) - should_ignore = True - break - except ValueError: - # Not under this ignored path, continue checking - continue - - if not should_ignore: + + # Check if this file is under any ignored path + is_ignored = any(_is_subpath(config_file_resolved, ignore_path) for ignore_path in ignore_paths_absolute) + + if not is_ignored: config_files.append(config_file) # Sort by path for consistent ordering - config_files.sort() - - return config_files + return sorted(config_files) def load_dataset_config(config_path: pathlib.Path) -> dict: @@ -159,6 +157,49 @@ def get_dataset_num_tokens(config_path: pathlib.Path) -> int: return _get_config_num_tokens(config_dict, config_path.parent) +def _get_token_count(config_file: pathlib.Path) -> float: + """ + Get token count in billions for a dataset config file. + """ + num_tokens = get_dataset_num_tokens(config_file) + logger.info(f" - {config_file.name}: {num_tokens:,} tokens") + return num_tokens / 1e9 + + +def _create_dataset_reference(config_file: pathlib.Path, use_file_refs: bool) -> dict: + """ + Create a dataset reference or inline config. + + Args: + config_file: Path to the dataset config file + use_file_refs: If True, create a file reference; if False, inline the config + + Returns: + Dictionary representing the dataset + """ + if use_file_refs: + return {"type": "file", "path": str(config_file)} + else: + return load_dataset_config(config_file) + + +def _get_directory_name(directory: pathlib.Path, root_dir: pathlib.Path, suffix: str = "") -> str: + """ + Generate a name for a directory relative to root. 
+ + Args: + directory: The directory to name + root_dir: The root directory + suffix: Optional suffix to append to the name + + Returns: + A string name for the directory + """ + rel_path = directory.relative_to(root_dir) if directory != root_dir else pathlib.Path(".") + base_name = str(rel_path).replace("/", "_").replace(".", root_dir.name) + return f"{base_name}{suffix}" if suffix else base_name + + def create_blended_config( config_files: list[pathlib.Path], name: str = "blended", @@ -189,32 +230,20 @@ def create_blended_config( else: return load_dataset_config(config_files[0]) - # Multiple datasets + # Build datasets and weights in a single pass + logger.info("Calculating token counts for blended dataset weights...") datasets = [] + weights = [] + for config_file in config_files: + # Add dataset reference or inline config if use_file_refs: - datasets.append( - { - "type": "file", - "path": str(config_file), - } - ) + datasets.append({"type": "file", "path": str(config_file)}) else: datasets.append(load_dataset_config(config_file)) - # Get token counts for each dataset to calculate weights - logger.info("Calculating token counts for blended dataset weights...") - weights = [] - for config_file in config_files: - try: - num_tokens = get_dataset_num_tokens(config_file) - weights.append(num_tokens / 1e9) - logger.info(f" - {config_file.name}: {num_tokens:,} tokens") - except Exception as e: - logger.error(f"Failed to get token count for {config_file}: {e}") - # Use weight of 1 as fallback - weights.append(1) - logger.warning(f" - {config_file.name}: using fallback weight of 1") + # Get token count for weight + weights.append(_get_token_count(config_file)) return { "type": "blended", @@ -237,15 +266,11 @@ def group_configs_by_directory( Returns: Dictionary mapping directory paths to lists of config files in that directory """ - groups: dict[pathlib.Path, list[pathlib.Path]] = {} - + groups: dict[pathlib.Path, list[pathlib.Path]] = defaultdict(list) for config_file in config_files: - parent_dir = config_file.parent - if parent_dir not in groups: - groups[parent_dir] = [] - groups[parent_dir].append(config_file) + groups[config_file.parent].append(config_file) - return groups + return dict(groups) def build_directory_tree( @@ -284,7 +309,7 @@ def create_directory_config( tree: dict[pathlib.Path, set[pathlib.Path]], root_dir: pathlib.Path, use_file_refs: bool, -) -> tuple[dict, int] | None: +) -> tuple[dict, float] | None: """ Recursively create a blended config for a directory and its subdirectories. 
@@ -296,37 +321,20 @@ def create_directory_config( use_file_refs: Whether to use file references Returns: - Tuple of (config dictionary, total token count), or None if directory has no datasets + Tuple of (config dictionary, total token count in billions), or None if directory has no datasets """ local_datasets = [] - local_config_files = [] local_tokens = [] - subdir_datasets = [] - subdir_tokens = [] - # First, collect configs directly in this directory (not in subdirectories) + # Collect configs directly in this directory (not in subdirectories) if directory in groups: for config_file in sorted(groups[directory]): - if use_file_refs: - local_datasets.append( - { - "type": "file", - "path": str(config_file), - } - ) - else: - local_datasets.append(load_dataset_config(config_file)) - local_config_files.append(config_file) - - # Get token count for this dataset - try: - num_tokens = get_dataset_num_tokens(config_file) - local_tokens.append(num_tokens / 1e9) - except Exception as e: - logger.warning(f"Failed to get token count for {config_file}: {e}") - local_tokens.append(1) - - # Then, recursively process subdirectories + local_datasets.append(_create_dataset_reference(config_file, use_file_refs)) + local_tokens.append(_get_token_count(config_file)) + + # Recursively process subdirectories + subdir_datasets = [] + subdir_tokens = [] if directory in tree: for subdir in sorted(tree[directory]): subdir_result = create_directory_config(subdir, groups, tree, root_dir, use_file_refs) @@ -335,17 +343,14 @@ def create_directory_config( subdir_datasets.append(subdir_config) subdir_tokens.append(subdir_token_count) - # If we have both local datasets and subdirectory datasets, group local ones first + # Combine local and subdirectory datasets if local_datasets and subdir_datasets: - # Create a group for local datasets if there are multiple + # If multiple local datasets, group them together if len(local_datasets) > 1: - rel_path = directory.relative_to(root_dir) if directory != root_dir else pathlib.Path(".") - local_name = f"{str(rel_path).replace('/', '_').replace('.', root_dir.name)}_local" local_total_tokens = sum(local_tokens) - local_group = { "type": "blended", - "name": local_name, + "name": _get_directory_name(directory, root_dir, "_local"), "datasets": local_datasets, "weights": local_tokens, } @@ -363,21 +368,16 @@ def create_directory_config( else: return None - # Calculate total tokens for this directory total_tokens = sum(all_tokens) + # Don't wrap a single dataset if len(all_datasets) == 1: - # Don't wrap a single dataset return all_datasets[0], total_tokens # Multiple datasets - create blended config - rel_path = directory.relative_to(root_dir) if directory != root_dir else pathlib.Path(".") - name = str(rel_path).replace("/", "_").replace(".", root_dir.name) - - # Use the collected token counts as weights return { "type": "blended", - "name": name, + "name": _get_directory_name(directory, root_dir), "datasets": all_datasets, "weights": all_tokens, }, total_tokens From 2fd99d87a74b9c52a6ab7339bacdba2e6298371e Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Mon, 22 Dec 2025 19:26:46 +0000 Subject: [PATCH 04/13] add comment in yaml file --- tools/discover_datasets.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/tools/discover_datasets.py b/tools/discover_datasets.py index 776734299..686f562d5 100644 --- a/tools/discover_datasets.py +++ b/tools/discover_datasets.py @@ -63,15 +63,6 @@ def find_dataset_configs(root_dir: 
pathlib.Path, ignore_paths: list[pathlib.Path def load_dataset_config(config_path: pathlib.Path) -> dict: - """ - Load a dataset config from a YAML file. - - Args: - config_path: Path to the config file - - Returns: - The loaded config as a dictionary - """ with open(config_path) as f: config = yaml.safe_load(f) return config @@ -253,15 +244,12 @@ def create_blended_config( } -def group_configs_by_directory( - config_files: list[pathlib.Path], root_dir: pathlib.Path -) -> dict[pathlib.Path, list[pathlib.Path]]: +def group_configs_by_directory(config_files: list[pathlib.Path]) -> dict[pathlib.Path, list[pathlib.Path]]: """ Group config files by their parent directory. Args: config_files: List of config file paths - root_dir: Root directory to use for relative paths Returns: Dictionary mapping directory paths to lists of config files in that directory @@ -420,7 +408,7 @@ def create_hierarchical_config( logger.info(f" - {config_file.relative_to(root_dir)}") # Group configs by directory - groups = group_configs_by_directory(config_files, root_dir) + groups = group_configs_by_directory(config_files) # Build directory tree tree = build_directory_tree(groups, root_dir) @@ -470,9 +458,22 @@ def run(self): ignore_paths=self.ignore_paths, ) - # Write the config to the output file + # Write the config to the output file with header comment self.output.parent.mkdir(parents=True, exist_ok=True) with open(self.output, "w") as f: + # Write header comment + f.write( + "# This file was generated with tools/discover_datasets.py; weights are token-counts in billions.\n" + ) + f.write(f"# Configuration:\n") + f.write(f"# directory: {self.directory}\n") + f.write(f"# use_file_refs: {self.use_file_refs}\n") + if self.ignore_paths: + f.write(f"# ignore_paths:\n") + for ignore_path in self.ignore_paths: + f.write(f"# - {ignore_path}\n") + f.write("\n") + # Write the YAML config yaml.safe_dump(config, f, default_flow_style=False, sort_keys=False) logger.info(f"Generated dataset config saved to {self.output}") From b4c5034a78075a664e869116a4b85568408cda3b Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Mon, 22 Dec 2025 19:50:59 +0000 Subject: [PATCH 05/13] update readme --- tools/README_discover_datasets.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/README_discover_datasets.md b/tools/README_discover_datasets.md index 2959dc33d..af0b21819 100644 --- a/tools/README_discover_datasets.md +++ b/tools/README_discover_datasets.md @@ -24,7 +24,7 @@ python tools/discover_datasets.py -o [options] - `directory`: Directory to search for datasets recursively (required) - `-o, --output`: Output path for the generated config YAML file (required) -- `--no-file-refs`: Inline configs instead of using file references (optional) +- `--no-file-refs`: Inline configs instead of using file references (optional, not recommended) - `--ignore`: Path to ignore during dataset discovery (can be specified multiple times, optional) **Examples:** @@ -86,9 +86,9 @@ datasets: - type: file path: /path/to/dataset2/fast_llm_config.yaml weights: - - 1500000 # Dataset 1 has 1.5M tokens - - 500000 # Dataset 2 has 500K tokens - - 2000000 # Dataset 3 has 2M tokens + - 1.5 # Dataset 1 has 1.5B tokens + - 0.5 # Dataset 2 has 0.5B tokens + - 2.0 # Dataset 3 has 2.0B tokens ``` With blended datasets, samples are drawn from each dataset proportionally to their weights during training. 
This means: From 8b00983e5e51ca9833fe4f6bf0f91b4308756399 Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Tue, 6 Jan 2026 18:52:07 +0000 Subject: [PATCH 06/13] move to data preparator --- .../preparator/dataset_discovery/__init__.py | 4 + .../preparator/dataset_discovery/config.py | 51 ++ .../preparator/dataset_discovery/prepare.py | 473 +++++++++++++++++ tools/discover_datasets.py | 486 +----------------- 4 files changed, 533 insertions(+), 481 deletions(-) create mode 100644 fast_llm/data/preparator/dataset_discovery/__init__.py create mode 100644 fast_llm/data/preparator/dataset_discovery/config.py create mode 100644 fast_llm/data/preparator/dataset_discovery/prepare.py diff --git a/fast_llm/data/preparator/dataset_discovery/__init__.py b/fast_llm/data/preparator/dataset_discovery/__init__.py new file mode 100644 index 000000000..a9d38880a --- /dev/null +++ b/fast_llm/data/preparator/dataset_discovery/__init__.py @@ -0,0 +1,4 @@ +from fast_llm.data.preparator.dataset_discovery.config import DatasetDiscoveryConfig +from fast_llm.data.preparator.dataset_discovery.prepare import DatasetDiscoveryPreparator + +__all__ = ["DatasetDiscoveryConfig", "DatasetDiscoveryPreparator"] diff --git a/fast_llm/data/preparator/dataset_discovery/config.py b/fast_llm/data/preparator/dataset_discovery/config.py new file mode 100644 index 000000000..d6765d45e --- /dev/null +++ b/fast_llm/data/preparator/dataset_discovery/config.py @@ -0,0 +1,51 @@ +import pathlib +import typing + +from fast_llm.config import Field, FieldHint, config_class +from fast_llm.data.preparator.config import DatasetPreparatorConfig +from fast_llm.engine.config_utils.runnable import RunnableConfig + +if typing.TYPE_CHECKING: + from fast_llm.data.preparator.dataset_discovery.prepare import DatasetDiscoveryPreparator + + +@config_class(dynamic_type={RunnableConfig: "prepare_dataset_discovery", DatasetPreparatorConfig: "dataset_discovery"}) +class DatasetDiscoveryConfig(DatasetPreparatorConfig): + """ + Configuration for the dataset discovery preparator. + + This preparator recursively discovers datasets in a directory and generates + a blended dataset config with weights proportional to token counts. 
+ """ + + directory: pathlib.Path = Field( + desc="Directory to search for datasets recursively", + hint=FieldHint.core, + ) + output: pathlib.Path = Field( + desc="Output path for the generated config YAML file", + hint=FieldHint.core, + ) + use_file_refs: bool = Field( + default=True, + desc="Use file references (type: file) instead of inlining configs", + hint=FieldHint.optional, + ) + ignore_paths: list[pathlib.Path] = Field( + default_factory=list, + desc="List of paths to ignore during dataset discovery (can be absolute or relative to directory)", + hint=FieldHint.optional, + ) + + def _validate(self) -> None: + super()._validate() + if not self.directory.exists(): + raise ValueError(f"Directory does not exist: {self.directory}") + if not self.directory.is_dir(): + raise ValueError(f"Path is not a directory: {self.directory}") + + @classmethod + def get_dataset_preparator_class(cls) -> type["DatasetDiscoveryPreparator"]: + from fast_llm.data.preparator.dataset_discovery.prepare import DatasetDiscoveryPreparator + + return DatasetDiscoveryPreparator diff --git a/fast_llm/data/preparator/dataset_discovery/prepare.py b/fast_llm/data/preparator/dataset_discovery/prepare.py new file mode 100644 index 000000000..f054524a9 --- /dev/null +++ b/fast_llm/data/preparator/dataset_discovery/prepare.py @@ -0,0 +1,473 @@ +""" +Dataset discovery preparator. + +This module provides functionality to recursively discover datasets in a directory +and generate a blended dataset config with weights proportional to token counts. +""" + +import json +import logging +import pathlib +from collections import defaultdict + +import yaml + +from fast_llm.data.preparator.config import DatasetPreparator +from fast_llm.data.preparator.dataset_discovery.config import DatasetDiscoveryConfig + +logger = logging.getLogger(__name__) + + +class DatasetDiscoveryPreparator[ConfigType: DatasetDiscoveryConfig](DatasetPreparator[ConfigType]): + """ + Preparator for discovering datasets in a directory tree and generating blended configs. + """ + + _config: DatasetDiscoveryConfig + + def run(self) -> None: + """ + Run the dataset discovery preparator. + """ + # Generate the hierarchical config + config = self._create_hierarchical_config( + self._config.directory.resolve(), + use_file_refs=self._config.use_file_refs, + ignore_paths=self._config.ignore_paths, + ) + + # Write the config to the output file with header comment + self._config.output.parent.mkdir(parents=True, exist_ok=True) + with open(self._config.output, "w") as f: + # Write header comment + f.write( + "# This file was generated with fast_llm.data.preparator.dataset_discovery; " + "weights are token-counts in billions.\n" + ) + f.write(f"# Configuration:\n") + f.write(f"# directory: {self._config.directory}\n") + f.write(f"# use_file_refs: {self._config.use_file_refs}\n") + if self._config.ignore_paths: + f.write(f"# ignore_paths:\n") + for ignore_path in self._config.ignore_paths: + f.write(f"# - {ignore_path}\n") + f.write("\n") + # Write the YAML config + yaml.safe_dump(config, f, default_flow_style=False, sort_keys=False) + + logger.info(f"Generated dataset config saved to {self._config.output}") + + # Print a preview of the config + logger.info("\nGenerated config preview:") + preview = yaml.safe_dump(config, default_flow_style=False, sort_keys=False) + for line in preview.split("\n")[:50]: # Show first 50 lines + logger.info(line) + + if len(preview.split("\n")) > 50: + logger.info("... 
(truncated)") + + @staticmethod + def _is_subpath(path: pathlib.Path, parent: pathlib.Path) -> bool: + """Check if path is under parent directory.""" + try: + path.relative_to(parent) + return True + except ValueError: + return False + + def _find_dataset_configs( + self, root_dir: pathlib.Path, ignore_paths: list[pathlib.Path] | None = None + ) -> list[pathlib.Path]: + """ + Recursively find all fast_llm_config*.yaml files in the directory tree. + + Args: + root_dir: Root directory to search + ignore_paths: List of paths to ignore (can be absolute or relative to root_dir) + + Returns: + List of paths to fast_llm_config*.yaml files + """ + # Normalize ignore paths to absolute paths + ignore_paths_absolute = set() + if ignore_paths: + for ignore_path in ignore_paths: + if ignore_path.is_absolute(): + ignore_paths_absolute.add(ignore_path.resolve()) + else: + ignore_paths_absolute.add((root_dir / ignore_path).resolve()) + + # Find all fast_llm_config*.yaml files and filter out ignored ones + config_files = [] + for config_file in root_dir.rglob("fast_llm_config*.yaml"): + config_file_resolved = config_file.resolve() + + # Check if this file is under any ignored path + is_ignored = any( + self._is_subpath(config_file_resolved, ignore_path) for ignore_path in ignore_paths_absolute + ) + + if not is_ignored: + config_files.append(config_file) + + # Sort by path for consistent ordering + return sorted(config_files) + + @staticmethod + def _load_dataset_config(config_path: pathlib.Path) -> dict: + """Load a dataset config from a YAML file.""" + with open(config_path) as f: + config = yaml.safe_load(f) + return config + + @staticmethod + def _read_memmap_num_tokens(memmap_path: pathlib.Path) -> int: + """Read number of tokens from a memmap file.""" + from fast_llm.data.dataset.memmap import FILE_HEADER + from fast_llm.data.sample.abstract import MemmapIndexDatasetReaderConfig + + if not memmap_path.exists(): + logger.warning(f"Memmap file not found: {memmap_path}") + return 0 + + try: + with memmap_path.open("rb") as stream: + header = stream.read(len(FILE_HEADER)) + if header != FILE_HEADER: + logger.warning(f"Invalid memmap file format: {memmap_path}") + return 0 + stream.seek(int.from_bytes(stream.read(8), signed=False)) + config_bytes = stream.read(int.from_bytes(stream.read(4), signed=False)) + reader_config = MemmapIndexDatasetReaderConfig.from_dict(json.loads(config_bytes.decode("utf-8"))) + return reader_config.num_tokens + except Exception as e: + logger.warning(f"Failed to read memmap file {memmap_path}: {e}") + return 0 + + @staticmethod + def _resolve_path(path: str | pathlib.Path, relative_to: pathlib.Path) -> pathlib.Path: + """Resolve a path relative to a base directory if not absolute.""" + path = pathlib.Path(path) + return path if path.is_absolute() else relative_to / path + + def _get_config_num_tokens(self, config_dict: dict, base_dir: pathlib.Path) -> int: + """Get number of tokens from a config dict (handles inline configs recursively).""" + dataset_type = config_dict.get("type") + + if dataset_type == "file": + file_path = self._resolve_path(config_dict["path"], base_dir) + return self._get_dataset_num_tokens(file_path) + + if dataset_type == "memmap": + memmap_path = self._resolve_path(config_dict.get("path", ""), base_dir) + return self._read_memmap_num_tokens(memmap_path) + + if dataset_type in ["blended", "sampled", "concatenated"]: + return sum(self._get_config_num_tokens(sub, base_dir) for sub in config_dict.get("datasets", [])) + + if dataset_type == "slice": + 
base_config = config_dict.get("dataset", {}) + begin = config_dict.get("begin", 0) + end = config_dict.get("end", 1) + base_tokens = self._get_config_num_tokens(base_config, base_dir) + return int(base_tokens * (end - begin)) + + logger.warning(f"Unsupported inline config type '{dataset_type}'") + return 0 + + def _get_dataset_num_tokens(self, config_path: pathlib.Path) -> int: + """ + Load a dataset config and get its number of tokens. + + Args: + config_path: Path to the dataset config file + + Returns: + Number of tokens in the dataset + """ + # Import preprocessing and sample configs to register them + import fast_llm.data.preprocessing.image_patch # noqa + import fast_llm.data.preprocessing.language_model # noqa + import fast_llm.data.sample.language_model # noqa + import fast_llm.data.sample.patch # noqa + import fast_llm.data.sample.range # noqa + import fast_llm.data.sample.token # noqa + + config_dict = self._load_dataset_config(config_path) + return self._get_config_num_tokens(config_dict, config_path.parent) + + def _get_token_count(self, config_file: pathlib.Path) -> float: + """ + Get token count in billions for a dataset config file. + """ + num_tokens = self._get_dataset_num_tokens(config_file) + logger.info(f" - {config_file.name}: {num_tokens:,} tokens") + return num_tokens / 1e9 + + def _create_dataset_reference(self, config_file: pathlib.Path, use_file_refs: bool) -> dict: + """ + Create a dataset reference or inline config. + + Args: + config_file: Path to the dataset config file + use_file_refs: If True, create a file reference; if False, inline the config + + Returns: + Dictionary representing the dataset + """ + if use_file_refs: + return {"type": "file", "path": str(config_file)} + else: + return self._load_dataset_config(config_file) + + @staticmethod + def _get_directory_name(directory: pathlib.Path, root_dir: pathlib.Path, suffix: str = "") -> str: + """ + Generate a name for a directory relative to root. + + Args: + directory: The directory to name + root_dir: The root directory + suffix: Optional suffix to append to the name + + Returns: + A string name for the directory + """ + rel_path = directory.relative_to(root_dir) if directory != root_dir else pathlib.Path(".") + base_name = str(rel_path).replace("/", "_").replace(".", root_dir.name) + return f"{base_name}{suffix}" if suffix else base_name + + def _create_blended_config( + self, + config_files: list[pathlib.Path], + name: str = "blended", + use_file_refs: bool = True, + ) -> dict: + """ + Create a blended dataset config from a list of config files. + + Args: + config_files: List of paths to dataset config files + name: Name for the blended dataset + use_file_refs: If True, use file references (type: file, path: ...). + If False, inline the full configs. 
+ + Returns: + Dictionary representing a blended dataset config + """ + if len(config_files) == 0: + raise ValueError("No config files provided") + + if len(config_files) == 1: + # If only one dataset, just reference it directly + if use_file_refs: + return { + "type": "file", + "path": str(config_files[0]), + } + else: + return self._load_dataset_config(config_files[0]) + + # Build datasets and weights in a single pass + logger.info("Calculating token counts for blended dataset weights...") + datasets = [] + weights = [] + + for config_file in config_files: + # Add dataset reference or inline config + if use_file_refs: + datasets.append({"type": "file", "path": str(config_file)}) + else: + datasets.append(self._load_dataset_config(config_file)) + + # Get token count for weight + weights.append(self._get_token_count(config_file)) + + return { + "type": "blended", + "name": name, + "datasets": datasets, + "weights": weights, + } + + @staticmethod + def _group_configs_by_directory(config_files: list[pathlib.Path]) -> dict[pathlib.Path, list[pathlib.Path]]: + """ + Group config files by their parent directory. + + Args: + config_files: List of config file paths + + Returns: + Dictionary mapping directory paths to lists of config files in that directory + """ + groups: dict[pathlib.Path, list[pathlib.Path]] = defaultdict(list) + for config_file in config_files: + groups[config_file.parent].append(config_file) + + return dict(groups) + + @staticmethod + def _build_directory_tree( + groups: dict[pathlib.Path, list[pathlib.Path]], root_dir: pathlib.Path + ) -> dict[pathlib.Path, set[pathlib.Path]]: + """ + Build a tree structure of directories showing parent-child relationships. + + Args: + groups: Dictionary mapping directories to their config files + root_dir: Root directory + + Returns: + Dictionary mapping each directory to its immediate child directories + """ + tree: dict[pathlib.Path, set[pathlib.Path]] = {root_dir: set()} + + for directory in groups.keys(): + # Add all ancestors to the tree + current = directory + while current != root_dir and current.parent != current: + parent = current.parent + if parent not in tree: + tree[parent] = set() + if current not in tree: + tree[current] = set() + tree[parent].add(current) + current = parent + + return tree + + def _create_directory_config( + self, + directory: pathlib.Path, + groups: dict[pathlib.Path, list[pathlib.Path]], + tree: dict[pathlib.Path, set[pathlib.Path]], + root_dir: pathlib.Path, + use_file_refs: bool, + ) -> tuple[dict, float] | None: + """ + Recursively create a blended config for a directory and its subdirectories. 
+ + Args: + directory: Current directory to process + groups: Dictionary mapping directories to their config files + tree: Directory tree structure + root_dir: Root directory + use_file_refs: Whether to use file references + + Returns: + Tuple of (config dictionary, total token count in billions), or None if directory has no datasets + """ + local_datasets = [] + local_tokens = [] + + # Collect configs directly in this directory (not in subdirectories) + if directory in groups: + for config_file in sorted(groups[directory]): + local_datasets.append(self._create_dataset_reference(config_file, use_file_refs)) + local_tokens.append(self._get_token_count(config_file)) + + # Recursively process subdirectories + subdir_datasets = [] + subdir_tokens = [] + if directory in tree: + for subdir in sorted(tree[directory]): + subdir_result = self._create_directory_config(subdir, groups, tree, root_dir, use_file_refs) + if subdir_result is not None: + subdir_config, subdir_token_count = subdir_result + subdir_datasets.append(subdir_config) + subdir_tokens.append(subdir_token_count) + + # Combine local and subdirectory datasets + if local_datasets and subdir_datasets: + # If multiple local datasets, group them together + if len(local_datasets) > 1: + local_total_tokens = sum(local_tokens) + local_group = { + "type": "blended", + "name": self._get_directory_name(directory, root_dir, "_local"), + "datasets": local_datasets, + "weights": local_tokens, + } + all_datasets = [local_group] + subdir_datasets + all_tokens = [local_total_tokens] + subdir_tokens + else: + all_datasets = local_datasets + subdir_datasets + all_tokens = local_tokens + subdir_tokens + elif local_datasets: + all_datasets = local_datasets + all_tokens = local_tokens + elif subdir_datasets: + all_datasets = subdir_datasets + all_tokens = subdir_tokens + else: + return None + + total_tokens = sum(all_tokens) + + # Don't wrap a single dataset + if len(all_datasets) == 1: + return all_datasets[0], total_tokens + + # Multiple datasets - create blended config + return { + "type": "blended", + "name": self._get_directory_name(directory, root_dir), + "datasets": all_datasets, + "weights": all_tokens, + }, total_tokens + + def _create_hierarchical_config( + self, + root_dir: pathlib.Path, + use_file_refs: bool = True, + ignore_paths: list[pathlib.Path] | None = None, + ) -> dict: + """ + Create a hierarchical blended dataset config from all datasets in a directory. + + Datasets in the same directory are grouped together with weights proportional to token counts, + and these groups are nested following the directory structure. + + Args: + root_dir: Root directory to search for datasets + use_file_refs: If True, use file references (type: file). + If False, inline the full configs. 
+ ignore_paths: List of paths to ignore (can be absolute or relative to root_dir) + + Returns: + Dictionary representing the hierarchical blended dataset config + """ + logger.info(f"Discovering datasets in {root_dir}...") + + if ignore_paths: + logger.info(f"Ignoring {len(ignore_paths)} path(s):") + for ignore_path in ignore_paths: + logger.info(f" - {ignore_path}") + + config_files = self._find_dataset_configs(root_dir, ignore_paths=ignore_paths) + + if not config_files: + raise ValueError(f"No fast_llm_config*.yaml files found in {root_dir}") + + logger.info(f"Found {len(config_files)} dataset config(s):") + for config_file in config_files: + logger.info(f" - {config_file.relative_to(root_dir)}") + + # Group configs by directory + groups = self._group_configs_by_directory(config_files) + + # Build directory tree + tree = self._build_directory_tree(groups, root_dir) + + # Create hierarchical config + result = self._create_directory_config(root_dir, groups, tree, root_dir, use_file_refs) + + if result is None: + raise ValueError("Failed to create config") + + config, total_tokens = result + + logger.info(f"Total tokens across all datasets: {total_tokens:,}") + + return config diff --git a/tools/discover_datasets.py b/tools/discover_datasets.py index 686f562d5..0744ec10c 100644 --- a/tools/discover_datasets.py +++ b/tools/discover_datasets.py @@ -1,491 +1,15 @@ """ Tool to recursively discover datasets in a directory and generate a blended dataset config. -This tool walks through a directory tree, identifies datasets by their fast_llm_config*.yaml files, -and generates a config file that blends all discovered datasets with weights proportional to token counts. +This tool is a command-line wrapper around the DatasetDiscoveryPreparator. +For programmatic usage, use fast_llm.data.preparator.dataset_discovery directly. """ import argparse import logging import pathlib -from collections import defaultdict -import yaml - -from fast_llm.config import Field, config_class -from fast_llm.engine.config_utils.runnable import RunnableConfig - -logger = logging.getLogger(__name__) - - -def _is_subpath(path: pathlib.Path, parent: pathlib.Path) -> bool: - """Check if path is under parent directory.""" - try: - path.relative_to(parent) - return True - except ValueError: - return False - - -def find_dataset_configs(root_dir: pathlib.Path, ignore_paths: list[pathlib.Path] | None = None) -> list[pathlib.Path]: - """ - Recursively find all fast_llm_config*.yaml files in the directory tree. 
- - Args: - root_dir: Root directory to search - ignore_paths: List of paths to ignore (can be absolute or relative to root_dir) - - Returns: - List of paths to fast_llm_config*.yaml files - """ - # Normalize ignore paths to absolute paths - ignore_paths_absolute = set() - if ignore_paths: - for ignore_path in ignore_paths: - if ignore_path.is_absolute(): - ignore_paths_absolute.add(ignore_path.resolve()) - else: - ignore_paths_absolute.add((root_dir / ignore_path).resolve()) - - # Find all fast_llm_config*.yaml files and filter out ignored ones - config_files = [] - for config_file in root_dir.rglob("fast_llm_config*.yaml"): - config_file_resolved = config_file.resolve() - - # Check if this file is under any ignored path - is_ignored = any(_is_subpath(config_file_resolved, ignore_path) for ignore_path in ignore_paths_absolute) - - if not is_ignored: - config_files.append(config_file) - - # Sort by path for consistent ordering - return sorted(config_files) - - -def load_dataset_config(config_path: pathlib.Path) -> dict: - with open(config_path) as f: - config = yaml.safe_load(f) - return config - - -def _read_memmap_num_tokens(memmap_path: pathlib.Path) -> int: - """Read number of tokens from a memmap file.""" - import json - - from fast_llm.data.dataset.memmap import FILE_HEADER - from fast_llm.data.sample.abstract import MemmapIndexDatasetReaderConfig - - if not memmap_path.exists(): - logger.warning(f"Memmap file not found: {memmap_path}") - return 0 - - try: - with memmap_path.open("rb") as stream: - header = stream.read(len(FILE_HEADER)) - if header != FILE_HEADER: - logger.warning(f"Invalid memmap file format: {memmap_path}") - return 0 - stream.seek(int.from_bytes(stream.read(8), signed=False)) - config_bytes = stream.read(int.from_bytes(stream.read(4), signed=False)) - reader_config = MemmapIndexDatasetReaderConfig.from_dict(json.loads(config_bytes.decode("utf-8"))) - return reader_config.num_tokens - except Exception as e: - logger.warning(f"Failed to read memmap file {memmap_path}: {e}") - return 0 - - -def _resolve_path(path: str | pathlib.Path, relative_to: pathlib.Path) -> pathlib.Path: - """Resolve a path relative to a base directory if not absolute.""" - path = pathlib.Path(path) - return path if path.is_absolute() else relative_to / path - - -def _get_config_num_tokens(config_dict: dict, base_dir: pathlib.Path) -> int: - """Get number of tokens from a config dict (handles inline configs recursively).""" - dataset_type = config_dict.get("type") - - if dataset_type == "file": - file_path = _resolve_path(config_dict["path"], base_dir) - return get_dataset_num_tokens(file_path) - - if dataset_type == "memmap": - memmap_path = _resolve_path(config_dict.get("path", ""), base_dir) - return _read_memmap_num_tokens(memmap_path) - - if dataset_type in ["blended", "sampled", "concatenated"]: - return sum(_get_config_num_tokens(sub, base_dir) for sub in config_dict.get("datasets", [])) - - if dataset_type == "slice": - base_config = config_dict.get("dataset", {}) - begin = config_dict.get("begin", 0) - end = config_dict.get("end", 1) - base_tokens = _get_config_num_tokens(base_config, base_dir) - return int(base_tokens * (end - begin)) - - logger.warning(f"Unsupported inline config type '{dataset_type}'") - return 0 - - -def get_dataset_num_tokens(config_path: pathlib.Path) -> int: - """ - Load a dataset config and get its number of tokens. 
- - Args: - config_path: Path to the dataset config file - - Returns: - Number of tokens in the dataset - """ - # Import preprocessing and sample configs to register them - import fast_llm.data.preprocessing.image_patch # noqa - import fast_llm.data.preprocessing.language_model # noqa - import fast_llm.data.sample.language_model # noqa - import fast_llm.data.sample.patch # noqa - import fast_llm.data.sample.range # noqa - import fast_llm.data.sample.token # noqa - - config_dict = load_dataset_config(config_path) - return _get_config_num_tokens(config_dict, config_path.parent) - - -def _get_token_count(config_file: pathlib.Path) -> float: - """ - Get token count in billions for a dataset config file. - """ - num_tokens = get_dataset_num_tokens(config_file) - logger.info(f" - {config_file.name}: {num_tokens:,} tokens") - return num_tokens / 1e9 - - -def _create_dataset_reference(config_file: pathlib.Path, use_file_refs: bool) -> dict: - """ - Create a dataset reference or inline config. - - Args: - config_file: Path to the dataset config file - use_file_refs: If True, create a file reference; if False, inline the config - - Returns: - Dictionary representing the dataset - """ - if use_file_refs: - return {"type": "file", "path": str(config_file)} - else: - return load_dataset_config(config_file) - - -def _get_directory_name(directory: pathlib.Path, root_dir: pathlib.Path, suffix: str = "") -> str: - """ - Generate a name for a directory relative to root. - - Args: - directory: The directory to name - root_dir: The root directory - suffix: Optional suffix to append to the name - - Returns: - A string name for the directory - """ - rel_path = directory.relative_to(root_dir) if directory != root_dir else pathlib.Path(".") - base_name = str(rel_path).replace("/", "_").replace(".", root_dir.name) - return f"{base_name}{suffix}" if suffix else base_name - - -def create_blended_config( - config_files: list[pathlib.Path], - name: str = "blended", - use_file_refs: bool = True, -) -> dict: - """ - Create a blended dataset config from a list of config files. - - Args: - config_files: List of paths to dataset config files - name: Name for the blended dataset - use_file_refs: If True, use file references (type: file, path: ...). - If False, inline the full configs. - - Returns: - Dictionary representing a blended dataset config - """ - if len(config_files) == 0: - raise ValueError("No config files provided") - - if len(config_files) == 1: - # If only one dataset, just reference it directly - if use_file_refs: - return { - "type": "file", - "path": str(config_files[0]), - } - else: - return load_dataset_config(config_files[0]) - - # Build datasets and weights in a single pass - logger.info("Calculating token counts for blended dataset weights...") - datasets = [] - weights = [] - - for config_file in config_files: - # Add dataset reference or inline config - if use_file_refs: - datasets.append({"type": "file", "path": str(config_file)}) - else: - datasets.append(load_dataset_config(config_file)) - - # Get token count for weight - weights.append(_get_token_count(config_file)) - - return { - "type": "blended", - "name": name, - "datasets": datasets, - "weights": weights, - } - - -def group_configs_by_directory(config_files: list[pathlib.Path]) -> dict[pathlib.Path, list[pathlib.Path]]: - """ - Group config files by their parent directory. 
- - Args: - config_files: List of config file paths - - Returns: - Dictionary mapping directory paths to lists of config files in that directory - """ - groups: dict[pathlib.Path, list[pathlib.Path]] = defaultdict(list) - for config_file in config_files: - groups[config_file.parent].append(config_file) - - return dict(groups) - - -def build_directory_tree( - groups: dict[pathlib.Path, list[pathlib.Path]], root_dir: pathlib.Path -) -> dict[pathlib.Path, set[pathlib.Path]]: - """ - Build a tree structure of directories showing parent-child relationships. - - Args: - groups: Dictionary mapping directories to their config files - root_dir: Root directory - - Returns: - Dictionary mapping each directory to its immediate child directories - """ - tree: dict[pathlib.Path, set[pathlib.Path]] = {root_dir: set()} - - for directory in groups.keys(): - # Add all ancestors to the tree - current = directory - while current != root_dir and current.parent != current: - parent = current.parent - if parent not in tree: - tree[parent] = set() - if current not in tree: - tree[current] = set() - tree[parent].add(current) - current = parent - - return tree - - -def create_directory_config( - directory: pathlib.Path, - groups: dict[pathlib.Path, list[pathlib.Path]], - tree: dict[pathlib.Path, set[pathlib.Path]], - root_dir: pathlib.Path, - use_file_refs: bool, -) -> tuple[dict, float] | None: - """ - Recursively create a blended config for a directory and its subdirectories. - - Args: - directory: Current directory to process - groups: Dictionary mapping directories to their config files - tree: Directory tree structure - root_dir: Root directory - use_file_refs: Whether to use file references - - Returns: - Tuple of (config dictionary, total token count in billions), or None if directory has no datasets - """ - local_datasets = [] - local_tokens = [] - - # Collect configs directly in this directory (not in subdirectories) - if directory in groups: - for config_file in sorted(groups[directory]): - local_datasets.append(_create_dataset_reference(config_file, use_file_refs)) - local_tokens.append(_get_token_count(config_file)) - - # Recursively process subdirectories - subdir_datasets = [] - subdir_tokens = [] - if directory in tree: - for subdir in sorted(tree[directory]): - subdir_result = create_directory_config(subdir, groups, tree, root_dir, use_file_refs) - if subdir_result is not None: - subdir_config, subdir_token_count = subdir_result - subdir_datasets.append(subdir_config) - subdir_tokens.append(subdir_token_count) - - # Combine local and subdirectory datasets - if local_datasets and subdir_datasets: - # If multiple local datasets, group them together - if len(local_datasets) > 1: - local_total_tokens = sum(local_tokens) - local_group = { - "type": "blended", - "name": _get_directory_name(directory, root_dir, "_local"), - "datasets": local_datasets, - "weights": local_tokens, - } - all_datasets = [local_group] + subdir_datasets - all_tokens = [local_total_tokens] + subdir_tokens - else: - all_datasets = local_datasets + subdir_datasets - all_tokens = local_tokens + subdir_tokens - elif local_datasets: - all_datasets = local_datasets - all_tokens = local_tokens - elif subdir_datasets: - all_datasets = subdir_datasets - all_tokens = subdir_tokens - else: - return None - - total_tokens = sum(all_tokens) - - # Don't wrap a single dataset - if len(all_datasets) == 1: - return all_datasets[0], total_tokens - - # Multiple datasets - create blended config - return { - "type": "blended", - "name": 
_get_directory_name(directory, root_dir), - "datasets": all_datasets, - "weights": all_tokens, - }, total_tokens - - -def create_hierarchical_config( - root_dir: pathlib.Path, - use_file_refs: bool = True, - ignore_paths: list[pathlib.Path] | None = None, -) -> dict: - """ - Create a hierarchical blended dataset config from all datasets in a directory. - - Datasets in the same directory are grouped together with weights proportional to token counts, - and these groups are nested following the directory structure. - - Args: - root_dir: Root directory to search for datasets - use_file_refs: If True, use file references (type: file). - If False, inline the full configs. - ignore_paths: List of paths to ignore (can be absolute or relative to root_dir) - - Returns: - Dictionary representing the hierarchical blended dataset config - """ - logger.info(f"Discovering datasets in {root_dir}...") - - if ignore_paths: - logger.info(f"Ignoring {len(ignore_paths)} path(s):") - for ignore_path in ignore_paths: - logger.info(f" - {ignore_path}") - - config_files = find_dataset_configs(root_dir, ignore_paths=ignore_paths) - - if not config_files: - raise ValueError(f"No fast_llm_config*.yaml files found in {root_dir}") - - logger.info(f"Found {len(config_files)} dataset config(s):") - for config_file in config_files: - logger.info(f" - {config_file.relative_to(root_dir)}") - - # Group configs by directory - groups = group_configs_by_directory(config_files) - - # Build directory tree - tree = build_directory_tree(groups, root_dir) - - # Create hierarchical config - result = create_directory_config(root_dir, groups, tree, root_dir, use_file_refs) - - if result is None: - raise ValueError("Failed to create config") - - config, total_tokens = result - - logger.info(f"Total tokens across all datasets: {total_tokens:,}") - - return config - - -@config_class() -class DiscoverDatasetsConfig(RunnableConfig): - """ - Configuration for the dataset discovery tool. - """ - - directory: pathlib.Path = Field(desc="Directory to search for datasets recursively") - output: pathlib.Path = Field(desc="Output path for the generated config YAML file") - use_file_refs: bool = Field(default=True, desc="Use file references (type: file) instead of inlining configs") - ignore_paths: list[pathlib.Path] = Field( - default_factory=list, - desc="List of paths to ignore during dataset discovery (can be absolute or relative to directory)", - ) - - def run(self): - """ - Run the dataset discovery tool. 
- """ - # Validate directory exists - if not self.directory.exists(): - raise ValueError(f"Directory does not exist: {self.directory}") - - if not self.directory.is_dir(): - raise ValueError(f"Path is not a directory: {self.directory}") - - # Generate the hierarchical config - config = create_hierarchical_config( - self.directory.resolve(), - use_file_refs=self.use_file_refs, - ignore_paths=self.ignore_paths, - ) - - # Write the config to the output file with header comment - self.output.parent.mkdir(parents=True, exist_ok=True) - with open(self.output, "w") as f: - # Write header comment - f.write( - "# This file was generated with tools/discover_datasets.py; weights are token-counts in billions.\n" - ) - f.write(f"# Configuration:\n") - f.write(f"# directory: {self.directory}\n") - f.write(f"# use_file_refs: {self.use_file_refs}\n") - if self.ignore_paths: - f.write(f"# ignore_paths:\n") - for ignore_path in self.ignore_paths: - f.write(f"# - {ignore_path}\n") - f.write("\n") - # Write the YAML config - yaml.safe_dump(config, f, default_flow_style=False, sort_keys=False) - - logger.info(f"Generated dataset config saved to {self.output}") - - # Print a preview of the config - logger.info("\nGenerated config preview:") - preview = yaml.safe_dump(config, default_flow_style=False, sort_keys=False) - for line in preview.split("\n")[:50]: # Show first 50 lines - logger.info(line) - - if len(preview.split("\n")) > 50: - logger.info("... (truncated)") +from fast_llm.data.preparator.dataset_discovery import DatasetDiscoveryConfig def main(): @@ -512,7 +36,7 @@ def main(): logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") # Create and run the config - config = DiscoverDatasetsConfig( + config = DatasetDiscoveryConfig( directory=args.directory, output=args.output, use_file_refs=not args.no_file_refs, @@ -529,4 +53,4 @@ def main(): if len(sys.argv) > 1 and not sys.argv[1].startswith("-") and sys.argv[1] != "--config": main() else: - DiscoverDatasetsConfig.parse_and_run() + DatasetDiscoveryConfig.parse_and_run() From b6fafcb1d5e431a5a04666b51936285c0e86ec7f Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Tue, 6 Jan 2026 19:31:30 +0000 Subject: [PATCH 07/13] search for .fast_llm_dataset files --- .../preparator/dataset_discovery/config.py | 9 +- .../preparator/dataset_discovery/prepare.py | 237 +++++------------- 2 files changed, 68 insertions(+), 178 deletions(-) diff --git a/fast_llm/data/preparator/dataset_discovery/config.py b/fast_llm/data/preparator/dataset_discovery/config.py index d6765d45e..d14b5bfd8 100644 --- a/fast_llm/data/preparator/dataset_discovery/config.py +++ b/fast_llm/data/preparator/dataset_discovery/config.py @@ -14,8 +14,8 @@ class DatasetDiscoveryConfig(DatasetPreparatorConfig): """ Configuration for the dataset discovery preparator. - This preparator recursively discovers datasets in a directory and generates - a blended dataset config with weights proportional to token counts. + This preparator recursively discovers .fast_llm_dataset files in a directory + and generates a blended dataset config with weights proportional to token counts. 
""" directory: pathlib.Path = Field( @@ -26,11 +26,6 @@ class DatasetDiscoveryConfig(DatasetPreparatorConfig): desc="Output path for the generated config YAML file", hint=FieldHint.core, ) - use_file_refs: bool = Field( - default=True, - desc="Use file references (type: file) instead of inlining configs", - hint=FieldHint.optional, - ) ignore_paths: list[pathlib.Path] = Field( default_factory=list, desc="List of paths to ignore during dataset discovery (can be absolute or relative to directory)", diff --git a/fast_llm/data/preparator/dataset_discovery/prepare.py b/fast_llm/data/preparator/dataset_discovery/prepare.py index f054524a9..42d4f125e 100644 --- a/fast_llm/data/preparator/dataset_discovery/prepare.py +++ b/fast_llm/data/preparator/dataset_discovery/prepare.py @@ -1,8 +1,8 @@ """ Dataset discovery preparator. -This module provides functionality to recursively discover datasets in a directory -and generate a blended dataset config with weights proportional to token counts. +This module discovers datasets by directly scanning for .fast_llm_dataset files +and reading token counts from their binary headers. """ import json @@ -20,7 +20,10 @@ class DatasetDiscoveryPreparator[ConfigType: DatasetDiscoveryConfig](DatasetPreparator[ConfigType]): """ - Preparator for discovering datasets in a directory tree and generating blended configs. + Preparator for discovering datasets by scanning .fast_llm_dataset files. + + Scans a directory tree for .fast_llm_dataset files and reads token counts + from their binary headers to generate a hierarchical blended config. """ _config: DatasetDiscoveryConfig @@ -29,10 +32,9 @@ def run(self) -> None: """ Run the dataset discovery preparator. """ - # Generate the hierarchical config + # Generate the hierarchical config by finding .fast_llm_dataset files config = self._create_hierarchical_config( self._config.directory.resolve(), - use_file_refs=self._config.use_file_refs, ignore_paths=self._config.ignore_paths, ) @@ -46,7 +48,6 @@ def run(self) -> None: ) f.write(f"# Configuration:\n") f.write(f"# directory: {self._config.directory}\n") - f.write(f"# use_file_refs: {self._config.use_file_refs}\n") if self._config.ignore_paths: f.write(f"# ignore_paths:\n") for ignore_path in self._config.ignore_paths: @@ -60,7 +61,7 @@ def run(self) -> None: # Print a preview of the config logger.info("\nGenerated config preview:") preview = yaml.safe_dump(config, default_flow_style=False, sort_keys=False) - for line in preview.split("\n")[:50]: # Show first 50 lines + for line in preview.split("\n")[:50]: logger.info(line) if len(preview.split("\n")) > 50: @@ -75,18 +76,18 @@ def _is_subpath(path: pathlib.Path, parent: pathlib.Path) -> bool: except ValueError: return False - def _find_dataset_configs( + def _find_dataset_files( self, root_dir: pathlib.Path, ignore_paths: list[pathlib.Path] | None = None ) -> list[pathlib.Path]: """ - Recursively find all fast_llm_config*.yaml files in the directory tree. + Recursively find all .fast_llm_dataset files in the directory tree. 
Args: root_dir: Root directory to search ignore_paths: List of paths to ignore (can be absolute or relative to root_dir) Returns: - List of paths to fast_llm_config*.yaml files + List of paths to .fast_llm_dataset files """ # Normalize ignore paths to absolute paths ignore_paths_absolute = set() @@ -97,32 +98,32 @@ def _find_dataset_configs( else: ignore_paths_absolute.add((root_dir / ignore_path).resolve()) - # Find all fast_llm_config*.yaml files and filter out ignored ones - config_files = [] - for config_file in root_dir.rglob("fast_llm_config*.yaml"): - config_file_resolved = config_file.resolve() + # Find all .fast_llm_dataset files and filter out ignored ones + dataset_files = [] + for dataset_file in root_dir.rglob("*.fast_llm_dataset"): + dataset_file_resolved = dataset_file.resolve() # Check if this file is under any ignored path is_ignored = any( - self._is_subpath(config_file_resolved, ignore_path) for ignore_path in ignore_paths_absolute + self._is_subpath(dataset_file_resolved, ignore_path) for ignore_path in ignore_paths_absolute ) if not is_ignored: - config_files.append(config_file) + dataset_files.append(dataset_file) # Sort by path for consistent ordering - return sorted(config_files) - - @staticmethod - def _load_dataset_config(config_path: pathlib.Path) -> dict: - """Load a dataset config from a YAML file.""" - with open(config_path) as f: - config = yaml.safe_load(f) - return config + return sorted(dataset_files) @staticmethod def _read_memmap_num_tokens(memmap_path: pathlib.Path) -> int: - """Read number of tokens from a memmap file.""" + """Read number of tokens from a .fast_llm_dataset memmap file.""" + # Import preprocessing and sample configs to register them + import fast_llm.data.preprocessing.image_patch # noqa + import fast_llm.data.preprocessing.language_model # noqa + import fast_llm.data.sample.language_model # noqa + import fast_llm.data.sample.patch # noqa + import fast_llm.data.sample.range # noqa + import fast_llm.data.sample.token # noqa from fast_llm.data.dataset.memmap import FILE_HEADER from fast_llm.data.sample.abstract import MemmapIndexDatasetReaderConfig @@ -144,81 +145,31 @@ def _read_memmap_num_tokens(memmap_path: pathlib.Path) -> int: logger.warning(f"Failed to read memmap file {memmap_path}: {e}") return 0 - @staticmethod - def _resolve_path(path: str | pathlib.Path, relative_to: pathlib.Path) -> pathlib.Path: - """Resolve a path relative to a base directory if not absolute.""" - path = pathlib.Path(path) - return path if path.is_absolute() else relative_to / path - - def _get_config_num_tokens(self, config_dict: dict, base_dir: pathlib.Path) -> int: - """Get number of tokens from a config dict (handles inline configs recursively).""" - dataset_type = config_dict.get("type") - - if dataset_type == "file": - file_path = self._resolve_path(config_dict["path"], base_dir) - return self._get_dataset_num_tokens(file_path) - - if dataset_type == "memmap": - memmap_path = self._resolve_path(config_dict.get("path", ""), base_dir) - return self._read_memmap_num_tokens(memmap_path) - - if dataset_type in ["blended", "sampled", "concatenated"]: - return sum(self._get_config_num_tokens(sub, base_dir) for sub in config_dict.get("datasets", [])) - - if dataset_type == "slice": - base_config = config_dict.get("dataset", {}) - begin = config_dict.get("begin", 0) - end = config_dict.get("end", 1) - base_tokens = self._get_config_num_tokens(base_config, base_dir) - return int(base_tokens * (end - begin)) - - logger.warning(f"Unsupported inline config type 
'{dataset_type}'") - return 0 - - def _get_dataset_num_tokens(self, config_path: pathlib.Path) -> int: + def _get_token_count(self, dataset_file: pathlib.Path) -> float | None: """ - Load a dataset config and get its number of tokens. - - Args: - config_path: Path to the dataset config file + Get token count in billions for a .fast_llm_dataset file. Returns: - Number of tokens in the dataset - """ - # Import preprocessing and sample configs to register them - import fast_llm.data.preprocessing.image_patch # noqa - import fast_llm.data.preprocessing.language_model # noqa - import fast_llm.data.sample.language_model # noqa - import fast_llm.data.sample.patch # noqa - import fast_llm.data.sample.range # noqa - import fast_llm.data.sample.token # noqa - - config_dict = self._load_dataset_config(config_path) - return self._get_config_num_tokens(config_dict, config_path.parent) - - def _get_token_count(self, config_file: pathlib.Path) -> float: - """ - Get token count in billions for a dataset config file. + Token count in billions, or None if the file couldn't be read """ - num_tokens = self._get_dataset_num_tokens(config_file) - logger.info(f" - {config_file.name}: {num_tokens:,} tokens") + num_tokens = self._read_memmap_num_tokens(dataset_file) + if num_tokens == 0: + logger.warning(f" - {dataset_file.name}: skipping (0 tokens or read error)") + return None + logger.debug(f" - {dataset_file.name}: {num_tokens:,} tokens") return num_tokens / 1e9 - def _create_dataset_reference(self, config_file: pathlib.Path, use_file_refs: bool) -> dict: + def _create_memmap_config_for_dataset(self, dataset_file: pathlib.Path) -> dict: """ - Create a dataset reference or inline config. + Create a memmap config dictionary for a .fast_llm_dataset file. Args: - config_file: Path to the dataset config file - use_file_refs: If True, create a file reference; if False, inline the config + dataset_file: Path to the .fast_llm_dataset file Returns: - Dictionary representing the dataset + Dictionary representing a memmap dataset config """ - if use_file_refs: - return {"type": "file", "path": str(config_file)} - else: - return self._load_dataset_config(config_file) + return {"type": "memmap", "path": str(dataset_file)} @staticmethod def _get_directory_name(directory: pathlib.Path, root_dir: pathlib.Path, suffix: str = "") -> str: @@ -237,73 +188,20 @@ def _get_directory_name(directory: pathlib.Path, root_dir: pathlib.Path, suffix: base_name = str(rel_path).replace("/", "_").replace(".", root_dir.name) return f"{base_name}{suffix}" if suffix else base_name - def _create_blended_config( - self, - config_files: list[pathlib.Path], - name: str = "blended", - use_file_refs: bool = True, - ) -> dict: - """ - Create a blended dataset config from a list of config files. - - Args: - config_files: List of paths to dataset config files - name: Name for the blended dataset - use_file_refs: If True, use file references (type: file, path: ...). - If False, inline the full configs. 
- - Returns: - Dictionary representing a blended dataset config - """ - if len(config_files) == 0: - raise ValueError("No config files provided") - - if len(config_files) == 1: - # If only one dataset, just reference it directly - if use_file_refs: - return { - "type": "file", - "path": str(config_files[0]), - } - else: - return self._load_dataset_config(config_files[0]) - - # Build datasets and weights in a single pass - logger.info("Calculating token counts for blended dataset weights...") - datasets = [] - weights = [] - - for config_file in config_files: - # Add dataset reference or inline config - if use_file_refs: - datasets.append({"type": "file", "path": str(config_file)}) - else: - datasets.append(self._load_dataset_config(config_file)) - - # Get token count for weight - weights.append(self._get_token_count(config_file)) - - return { - "type": "blended", - "name": name, - "datasets": datasets, - "weights": weights, - } - @staticmethod - def _group_configs_by_directory(config_files: list[pathlib.Path]) -> dict[pathlib.Path, list[pathlib.Path]]: + def _group_files_by_directory(dataset_files: list[pathlib.Path]) -> dict[pathlib.Path, list[pathlib.Path]]: """ - Group config files by their parent directory. + Group dataset files by their parent directory. Args: - config_files: List of config file paths + dataset_files: List of dataset file paths Returns: - Dictionary mapping directory paths to lists of config files in that directory + Dictionary mapping directory paths to lists of dataset files in that directory """ groups: dict[pathlib.Path, list[pathlib.Path]] = defaultdict(list) - for config_file in config_files: - groups[config_file.parent].append(config_file) + for dataset_file in dataset_files: + groups[dataset_file.parent].append(dataset_file) return dict(groups) @@ -315,7 +213,7 @@ def _build_directory_tree( Build a tree structure of directories showing parent-child relationships. Args: - groups: Dictionary mapping directories to their config files + groups: Dictionary mapping directories to their dataset files root_dir: Root directory Returns: @@ -343,17 +241,15 @@ def _create_directory_config( groups: dict[pathlib.Path, list[pathlib.Path]], tree: dict[pathlib.Path, set[pathlib.Path]], root_dir: pathlib.Path, - use_file_refs: bool, ) -> tuple[dict, float] | None: """ Recursively create a blended config for a directory and its subdirectories. 
Args: directory: Current directory to process - groups: Dictionary mapping directories to their config files + groups: Dictionary mapping directories to their dataset files tree: Directory tree structure root_dir: Root directory - use_file_refs: Whether to use file references Returns: Tuple of (config dictionary, total token count in billions), or None if directory has no datasets @@ -361,18 +257,20 @@ def _create_directory_config( local_datasets = [] local_tokens = [] - # Collect configs directly in this directory (not in subdirectories) + # Collect dataset files directly in this directory (not in subdirectories) if directory in groups: - for config_file in sorted(groups[directory]): - local_datasets.append(self._create_dataset_reference(config_file, use_file_refs)) - local_tokens.append(self._get_token_count(config_file)) + for dataset_file in sorted(groups[directory]): + token_count = self._get_token_count(dataset_file) + if token_count is not None: # Skip files that couldn't be read + local_datasets.append(self._create_memmap_config_for_dataset(dataset_file)) + local_tokens.append(token_count) # Recursively process subdirectories subdir_datasets = [] subdir_tokens = [] if directory in tree: for subdir in sorted(tree[directory]): - subdir_result = self._create_directory_config(subdir, groups, tree, root_dir, use_file_refs) + subdir_result = self._create_directory_config(subdir, groups, tree, root_dir) if subdir_result is not None: subdir_config, subdir_token_count = subdir_result subdir_datasets.append(subdir_config) @@ -420,54 +318,51 @@ def _create_directory_config( def _create_hierarchical_config( self, root_dir: pathlib.Path, - use_file_refs: bool = True, ignore_paths: list[pathlib.Path] | None = None, ) -> dict: """ - Create a hierarchical blended dataset config from all datasets in a directory. + Create a hierarchical blended dataset config from all .fast_llm_dataset files in a directory. Datasets in the same directory are grouped together with weights proportional to token counts, and these groups are nested following the directory structure. Args: root_dir: Root directory to search for datasets - use_file_refs: If True, use file references (type: file). - If False, inline the full configs. 
ignore_paths: List of paths to ignore (can be absolute or relative to root_dir) Returns: Dictionary representing the hierarchical blended dataset config """ - logger.info(f"Discovering datasets in {root_dir}...") + logger.info(f"Discovering .fast_llm_dataset files in {root_dir}...") if ignore_paths: logger.info(f"Ignoring {len(ignore_paths)} path(s):") for ignore_path in ignore_paths: logger.info(f" - {ignore_path}") - config_files = self._find_dataset_configs(root_dir, ignore_paths=ignore_paths) + dataset_files = self._find_dataset_files(root_dir, ignore_paths=ignore_paths) - if not config_files: - raise ValueError(f"No fast_llm_config*.yaml files found in {root_dir}") + if not dataset_files: + raise ValueError(f"No .fast_llm_dataset files found in {root_dir}") - logger.info(f"Found {len(config_files)} dataset config(s):") - for config_file in config_files: - logger.info(f" - {config_file.relative_to(root_dir)}") + logger.debug(f"Found {len(dataset_files)} dataset file(s):") + for dataset_file in dataset_files: + logger.debug(f" - {dataset_file.relative_to(root_dir)}") - # Group configs by directory - groups = self._group_configs_by_directory(config_files) + # Group dataset files by directory + groups = self._group_files_by_directory(dataset_files) # Build directory tree tree = self._build_directory_tree(groups, root_dir) # Create hierarchical config - result = self._create_directory_config(root_dir, groups, tree, root_dir, use_file_refs) + result = self._create_directory_config(root_dir, groups, tree, root_dir) if result is None: raise ValueError("Failed to create config") config, total_tokens = result - logger.info(f"Total tokens across all datasets: {total_tokens:,}") + logger.info(f"Total tokens across all datasets: {total_tokens:.2f}B") return config From 91923bce080e38caeaf8743a0f5259108c5604da Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Tue, 6 Jan 2026 19:49:04 +0000 Subject: [PATCH 08/13] add tests --- tests/data/test_dataset_discovery.py | 363 +++++++++++++++++++++++++++ 1 file changed, 363 insertions(+) create mode 100644 tests/data/test_dataset_discovery.py diff --git a/tests/data/test_dataset_discovery.py b/tests/data/test_dataset_discovery.py new file mode 100644 index 000000000..dd8eeac46 --- /dev/null +++ b/tests/data/test_dataset_discovery.py @@ -0,0 +1,363 @@ +""" +Tests for the dataset discovery preparator. 
+""" + +import pathlib + +import pytest + +from fast_llm.data.preparator.dataset_discovery.config import DatasetDiscoveryConfig +from fast_llm.data.preparator.dataset_discovery.prepare import DatasetDiscoveryPreparator + + +class TestDatasetDiscovery: + """Test dataset discovery that scans .fast_llm_dataset files.""" + + def test_find_dataset_files(self, tmp_path: pathlib.Path): + """Test finding .fast_llm_dataset files in directory tree.""" + # Create test directory structure + (tmp_path / "subdir1").mkdir() + (tmp_path / "subdir2").mkdir() + (tmp_path / "subdir1" / "nested").mkdir() + + # Create some .fast_llm_dataset files + (tmp_path / "dataset1.fast_llm_dataset").touch() + (tmp_path / "subdir1" / "dataset2.fast_llm_dataset").touch() + (tmp_path / "subdir1" / "nested" / "dataset3.fast_llm_dataset").touch() + (tmp_path / "subdir2" / "dataset4.fast_llm_dataset").touch() + + # Create some other files that should be ignored + (tmp_path / "readme.txt").touch() + (tmp_path / "subdir1" / "config.yaml").touch() + + # Create config + config = DatasetDiscoveryConfig( + directory=tmp_path, + output=tmp_path / "output.yaml", + ) + + # Create preparator + preparator = DatasetDiscoveryPreparator(config) + + # Find dataset files + dataset_files = preparator._find_dataset_files(tmp_path) + + # Should find all 4 .fast_llm_dataset files + assert len(dataset_files) == 4 + assert all(f.suffix == ".fast_llm_dataset" for f in dataset_files) + + def test_find_dataset_files_with_ignore(self, tmp_path: pathlib.Path): + """Test finding .fast_llm_dataset files with ignore paths.""" + # Create test directory structure + (tmp_path / "keep").mkdir() + (tmp_path / "ignore").mkdir() + + # Create dataset files + (tmp_path / "keep" / "dataset1.fast_llm_dataset").touch() + (tmp_path / "ignore" / "dataset2.fast_llm_dataset").touch() + + # Create config with ignore path + config = DatasetDiscoveryConfig( + directory=tmp_path, + output=tmp_path / "output.yaml", + ignore_paths=[pathlib.Path("ignore")], + ) + + # Create preparator + preparator = DatasetDiscoveryPreparator(config) + + # Find dataset files + dataset_files = preparator._find_dataset_files(tmp_path, ignore_paths=config.ignore_paths) + + # Should find only 1 file (dataset2 should be ignored) + assert len(dataset_files) == 1 + assert dataset_files[0].name == "dataset1.fast_llm_dataset" + + def test_group_files_by_directory(self, tmp_path: pathlib.Path): + """Test grouping dataset files by directory.""" + # Create files + files = [ + tmp_path / "dataset1.fast_llm_dataset", + tmp_path / "dataset2.fast_llm_dataset", + tmp_path / "subdir" / "dataset3.fast_llm_dataset", + ] + + # Group by directory + groups = DatasetDiscoveryPreparator._group_files_by_directory(files) + + # Should have 2 groups + assert len(groups) == 2 + assert len(groups[tmp_path]) == 2 + assert len(groups[tmp_path / "subdir"]) == 1 + + def test_build_directory_tree(self, tmp_path: pathlib.Path): + """Test building directory tree.""" + # Create nested directories + (tmp_path / "a" / "b" / "c").mkdir(parents=True) + + # Create groups + groups = { + tmp_path: [], + tmp_path / "a": [], + tmp_path / "a" / "b": [], + tmp_path / "a" / "b" / "c": [], + } + + # Build tree + tree = DatasetDiscoveryPreparator._build_directory_tree(groups, tmp_path) + + # Verify tree structure + assert tmp_path / "a" in tree[tmp_path] + assert tmp_path / "a" / "b" in tree[tmp_path / "a"] + assert tmp_path / "a" / "b" / "c" in tree[tmp_path / "a" / "b"] + + def test_create_memmap_config(self, tmp_path: pathlib.Path): + """Test 
creating memmap config for dataset file.""" + dataset_file = tmp_path / "dataset.fast_llm_dataset" + dataset_file.touch() + + config = DatasetDiscoveryConfig( + directory=tmp_path, + output=tmp_path / "output.yaml", + ) + preparator = DatasetDiscoveryPreparator(config) + + # Create config + memmap_config = preparator._create_memmap_config_for_dataset(dataset_file) + + # Verify config structure + assert memmap_config["type"] == "memmap" + assert memmap_config["path"] == str(dataset_file) + + def test_get_directory_name(self, tmp_path: pathlib.Path): + """Test directory naming.""" + root = tmp_path + subdir = tmp_path / "data" / "train" + + # Test root directory + name = DatasetDiscoveryPreparator._get_directory_name(root, root) + assert name == root.name + + # Test subdirectory + name = DatasetDiscoveryPreparator._get_directory_name(subdir, root) + assert name == "data_train" + + # Test with suffix + name = DatasetDiscoveryPreparator._get_directory_name(subdir, root, "_local") + assert name == "data_train_local" + + @pytest.mark.slow + def test_dataset_discovery_e2e_single_dataset(self, tmp_path: pathlib.Path): + """Test end-to-end discovery with a single dataset.""" + import shutil + + import yaml + + from tests.utils.dataset import get_common_test_dataset + + # Get a prepared test dataset + dataset_path, _, _, _ = get_common_test_dataset() + + # Copy the .fast_llm_dataset file to temp directory + dataset_files = list(dataset_path.glob("*.fast_llm_dataset")) + assert len(dataset_files) > 0, "No dataset files found in test dataset" + + test_dataset = dataset_files[0] + (tmp_path / "datasets").mkdir() + shutil.copy(test_dataset, tmp_path / "datasets" / "dataset.fast_llm_dataset") + + # Run dataset discovery + output_path = tmp_path / "discovered_config.yaml" + config = DatasetDiscoveryConfig( + directory=tmp_path / "datasets", + output=output_path, + ) + config.run() + + # Verify output file was created + assert output_path.exists() + + # Load and verify the generated config + with open(output_path) as f: + content = f.read() + # Check header comments + assert "# This file was generated with fast_llm.data.preparator.dataset_discovery" in content + assert "weights are token-counts in billions" in content + assert f"# directory: {tmp_path / 'datasets'}" in content + + # Parse YAML + f.seek(0) + generated_config = yaml.safe_load(f) + + # Single dataset should be returned directly (not blended) + assert generated_config["type"] == "memmap" + assert "dataset.fast_llm_dataset" in generated_config["path"] + + @pytest.mark.slow + def test_dataset_discovery_e2e_multiple_datasets(self, tmp_path: pathlib.Path): + """Test end-to-end discovery with multiple datasets in flat structure.""" + import shutil + + import yaml + + from tests.utils.dataset import get_alt_test_dataset, get_common_test_dataset + + # Get two different test datasets + dataset1_path, _, _, _ = get_common_test_dataset() + dataset2_path, _, _, _ = get_alt_test_dataset() + + # Copy dataset files to temp directory + (tmp_path / "datasets").mkdir() + dataset1_file = list(dataset1_path.glob("*.fast_llm_dataset"))[0] + dataset2_file = list(dataset2_path.glob("*.fast_llm_dataset"))[0] + + shutil.copy(dataset1_file, tmp_path / "datasets" / "dataset1.fast_llm_dataset") + shutil.copy(dataset2_file, tmp_path / "datasets" / "dataset2.fast_llm_dataset") + + # Run dataset discovery + output_path = tmp_path / "discovered_config.yaml" + config = DatasetDiscoveryConfig( + directory=tmp_path / "datasets", + output=output_path, + ) + config.run() + + # 
Verify output file was created + assert output_path.exists() + + # Load and verify the generated config + with open(output_path) as f: + generated_config = yaml.safe_load(f) + + # Multiple datasets should create a blended config + assert generated_config["type"] == "blended" + assert len(generated_config["datasets"]) == 2 + assert len(generated_config["weights"]) == 2 + + # Verify all weights are positive (in billions) + assert all(w > 0 for w in generated_config["weights"]) + + # Verify datasets are memmap configs + for dataset_config in generated_config["datasets"]: + assert dataset_config["type"] == "memmap" + assert "dataset" in dataset_config["path"] + + @pytest.mark.slow + def test_dataset_discovery_e2e_hierarchical_structure(self, tmp_path: pathlib.Path): + """Test end-to-end discovery with hierarchical directory structure.""" + import shutil + + import yaml + + from tests.utils.dataset import get_alt_test_dataset, get_common_test_dataset + + # Get test datasets + dataset1_path, _, _, _ = get_common_test_dataset() + dataset2_path, _, _, _ = get_alt_test_dataset() + + # Create hierarchical structure + (tmp_path / "root").mkdir() + (tmp_path / "root" / "group1").mkdir() + (tmp_path / "root" / "group2").mkdir() + + dataset1_file = list(dataset1_path.glob("*.fast_llm_dataset"))[0] + dataset2_file = list(dataset2_path.glob("*.fast_llm_dataset"))[0] + + # Place datasets in hierarchy + shutil.copy(dataset1_file, tmp_path / "root" / "dataset_a.fast_llm_dataset") + shutil.copy(dataset2_file, tmp_path / "root" / "dataset_b.fast_llm_dataset") + shutil.copy(dataset1_file, tmp_path / "root" / "group1" / "dataset_c.fast_llm_dataset") + shutil.copy(dataset2_file, tmp_path / "root" / "group2" / "dataset_d.fast_llm_dataset") + + # Run dataset discovery + output_path = tmp_path / "discovered_config.yaml" + config = DatasetDiscoveryConfig( + directory=tmp_path / "root", + output=output_path, + ) + config.run() + + # Load and verify the generated config + with open(output_path) as f: + generated_config = yaml.safe_load(f) + + # Should create hierarchical blended config + assert generated_config["type"] == "blended" + + # Root should have 3 items: local group + 2 subdirs + assert len(generated_config["datasets"]) == 3 + + # First item should be local datasets grouped with "_local" suffix + local_group = generated_config["datasets"][0] + assert local_group["type"] == "blended" + assert "_local" in local_group["name"] + assert len(local_group["datasets"]) == 2 + + # Next two should be subdirectory datasets (single dataset each, so memmap type) + # Check that one is from group1 and one from group2 + subdir_paths = [generated_config["datasets"][1]["path"], generated_config["datasets"][2]["path"]] + assert any("group1" in path for path in subdir_paths) + assert any("group2" in path for path in subdir_paths) + + @pytest.mark.slow + def test_dataset_discovery_e2e_with_ignore_paths(self, tmp_path: pathlib.Path): + """Test end-to-end discovery with ignore_paths.""" + import shutil + + import yaml + + from tests.utils.dataset import get_common_test_dataset + + # Get test dataset + dataset_path, _, _, _ = get_common_test_dataset() + dataset_file = list(dataset_path.glob("*.fast_llm_dataset"))[0] + + # Create directory structure + (tmp_path / "datasets" / "keep").mkdir(parents=True) + (tmp_path / "datasets" / "ignore").mkdir(parents=True) + + # Place datasets + shutil.copy(dataset_file, tmp_path / "datasets" / "keep" / "dataset1.fast_llm_dataset") + shutil.copy(dataset_file, tmp_path / "datasets" / "ignore" / 
"dataset2.fast_llm_dataset") + + # Run dataset discovery with ignore_paths + output_path = tmp_path / "discovered_config.yaml" + config = DatasetDiscoveryConfig( + directory=tmp_path / "datasets", + output=output_path, + ignore_paths=[pathlib.Path("ignore")], + ) + config.run() + + # Load and verify the generated config + with open(output_path) as f: + content = f.read() + # Check ignore_paths in header + assert "ignore_paths:" in content + assert "ignore" in content + + # Parse YAML + f.seek(0) + generated_config = yaml.safe_load(f) + + # Should only include the dataset from "keep" directory + # Single dataset, so should be memmap (not blended) + assert generated_config["type"] == "memmap" + assert "keep" in generated_config["path"] + assert "ignore" not in generated_config["path"] + + @pytest.mark.slow + def test_dataset_discovery_e2e_empty_directory(self, tmp_path: pathlib.Path): + """Test that discovery fails gracefully on empty directory.""" + # Create empty directory + (tmp_path / "empty").mkdir() + + # Run dataset discovery - should raise ValueError + output_path = tmp_path / "output.yaml" + config = DatasetDiscoveryConfig( + directory=tmp_path / "empty", + output=output_path, + ) + + with pytest.raises(ValueError, match="No .fast_llm_dataset files found"): + config.run() From e5c03cbc861ead801e560cedd23f5ac43c33abda Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Tue, 6 Jan 2026 19:51:20 +0000 Subject: [PATCH 09/13] move readme --- .../preparator/dataset_discovery/README.md | 383 ++++++++++++++++++ tools/README_discover_datasets.md | 267 ------------ 2 files changed, 383 insertions(+), 267 deletions(-) create mode 100644 fast_llm/data/preparator/dataset_discovery/README.md delete mode 100644 tools/README_discover_datasets.md diff --git a/fast_llm/data/preparator/dataset_discovery/README.md b/fast_llm/data/preparator/dataset_discovery/README.md new file mode 100644 index 000000000..8e7e6d590 --- /dev/null +++ b/fast_llm/data/preparator/dataset_discovery/README.md @@ -0,0 +1,383 @@ +# Dataset Discovery + +A tool to automatically discover `.fast_llm_dataset` files in a directory tree and generate blended configuration files with weights proportional to token counts. + +## Overview + +The dataset discovery preparator walks through a directory tree, identifies datasets by their `.fast_llm_dataset` files, and generates a configuration file that blends all discovered datasets with weights proportional to token counts. + +### How It Works + +1. Recursively scans a directory for `.fast_llm_dataset` files +2. Reads token counts directly from the binary file headers (no YAML parsing needed) +3. Generates a hierarchical blended config with weights proportional to token counts +4. Groups datasets by directory structure + +This is useful when you have a collection of prepared datasets and want to automatically create a blended training config without manually specifying each dataset and its weight. 
+ +## Features + +- **Automatic Discovery**: Recursively finds all `.fast_llm_dataset` files in nested directories +- **Direct Token Reading**: Reads token counts from binary file headers (fast and reliable) +- **Hierarchical Blending**: Preserves directory structure in generated config +- **Token-Proportional Weights**: Automatically calculates weights based on token counts for proportional sampling +- **Ignore Paths**: Exclude specific directories from discovery +- **Robust Error Handling**: Gracefully handles missing/invalid files + +## Usage + +### Command Line (using tools wrapper) + +```bash +python tools/discover_datasets.py -o [options] +``` + +**Arguments:** + +- `directory`: Directory to search for datasets recursively (required) +- `-o, --output`: Output path for the generated config YAML file (required) +- `--ignore`: Path to ignore during dataset discovery (can be specified multiple times, optional) + +**Examples:** + +```bash +# Basic usage - discover all datasets and create blended config +python tools/discover_datasets.py /path/to/datasets -o blended_dataset.yaml + +# Ignore specific paths during discovery +python tools/discover_datasets.py /path/to/datasets -o blended_dataset.yaml --ignore experiments/old --ignore tmp +``` + +### Config File + +Create a YAML config file: + +```yaml +# discover_config.yaml +type: prepare_dataset_discovery +directory: /path/to/datasets +output: blended_dataset.yaml +ignore_paths: # Optional + - ./test_data + - ./checkpoints +``` + +Run with Fast-LLM's config system: + +```bash +python -m fast_llm.engine.config_utils.run --config discover_config.yaml +``` + +Or directly via command line: + +```bash +python -m fast_llm.engine.config_utils.run prepare_dataset_discovery \ + --directory /path/to/datasets \ + --output discovered_datasets.yaml +``` + +Or using the tools wrapper with config: + +```bash +python tools/discover_datasets.py --config discover_config.yaml +``` + +## Configuration Options + +- **directory** (required): Root directory to scan for `.fast_llm_dataset` files +- **output** (required): Where to save the generated blended config YAML +- **ignore_paths** (optional): List of paths to exclude from discovery + - Can be absolute paths or relative to the directory + - Any datasets under these paths will be skipped + +## Dataset Identification + +The tool identifies datasets by looking for files with the `.fast_llm_dataset` extension. These files are binary memmap files created by Fast-LLM's dataset preparation commands. 
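+
+Conceptually, discovery is a recursive glob over the directory with ignore-path filtering and alphabetical ordering. The snippet below is a stand-alone illustration of that idea, not the preparator's actual `_find_dataset_files()` implementation:
+
+```python
+import pathlib
+
+
+def find_dataset_files(directory: pathlib.Path, ignore_paths: tuple[pathlib.Path, ...] = ()) -> list[pathlib.Path]:
+    # Resolve relative ignore paths against the search directory.
+    ignored = [path if path.is_absolute() else directory / path for path in ignore_paths]
+    # Sort for a deterministic, alphabetical ordering of discovered datasets.
+    return [
+        file
+        for file in sorted(directory.rglob("*.fast_llm_dataset"))
+        if not any(file.is_relative_to(ignore) for ignore in ignored)
+    ]
+```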
+ +Unlike the old implementation, this tool: +- ❌ Does NOT look for `fast_llm_config*.yaml` files +- ✅ Directly scans for `.fast_llm_dataset` binary files +- ✅ Reads metadata from binary file headers (faster, more reliable) + +## Output Format + +### Hierarchical Blended Config + +The tool generates a hierarchical blended config that mirrors your directory structure: + +```yaml +type: blended +name: root_directory +datasets: + - type: blended + name: domain_a_local # Local datasets grouped + datasets: + - type: memmap + path: /path/to/datasets/domain_a/shard_0.fast_llm_dataset + - type: memmap + path: /path/to/datasets/domain_a/shard_1.fast_llm_dataset + weights: [1.0, 1.0] # Token counts in billions + - type: memmap + path: /path/to/datasets/domain_b/shard_0.fast_llm_dataset # Single dataset (not wrapped) +weights: [2.0, 3.0] # Total tokens per group in billions +``` + +### Directory Structure Example + +Given this file structure: +``` +datasets/ +├── domain_a/ +│ ├── shard_0.fast_llm_dataset (1B tokens) +│ └── shard_1.fast_llm_dataset (1B tokens) +├── domain_b/ +│ ├── shard_0.fast_llm_dataset (2B tokens) +│ └── shard_1.fast_llm_dataset (2B tokens) +└── domain_c/ + └── shard_0.fast_llm_dataset (4B tokens) +``` + +The generated config will blend: +- `domain_a`: 2B tokens total (20% of samples) +- `domain_b`: 4B tokens total (40% of samples) +- `domain_c`: 4B tokens total (40% of samples) + +### Blended Datasets Explained + +With blended datasets, samples are drawn from each dataset proportionally to their weights during training: + +- **Proportional sampling**: Larger datasets (more tokens) are sampled more frequently +- **Interleaved samples**: Unlike concatenation, samples from different datasets are mixed +- **Automatic weights**: Calculated from token counts - no manual specification needed +- **Hierarchical weighting**: Subdirectories are weighted by their total token count + +### Using in Training Config + +The generated config can be used directly in a training config: + +```yaml +data: + datasets: + training: + type: file + path: blended_dataset.yaml +``` + +## Example Workflow + +### 1. Prepare Multiple Datasets + +```bash +# Prepare dataset 1 +fast-llm prepare --config dataset1_prepare.yaml + +# Prepare dataset 2 +fast-llm prepare --config dataset2_prepare.yaml + +# Prepare dataset 3 +fast-llm prepare --config dataset3_prepare.yaml +``` + +This creates a directory structure like: + +``` +my_datasets/ +├── dataset1/ +│ ├── shard_0_0.fast_llm_dataset +│ └── fast_llm_config.yaml +├── dataset2/ +│ ├── shard_0_0.fast_llm_dataset +│ └── fast_llm_config.yaml +└── dataset3/ + ├── shard_0_0.fast_llm_dataset + └── fast_llm_config.yaml +``` + +### 2. Discover and Blend Datasets + +```bash +python tools/discover_datasets.py my_datasets/ -o blended_datasets.yaml +``` + +This generates `blended_datasets.yaml`: + +```yaml +# This file was generated with fast_llm.data.preparator.dataset_discovery; weights are token-counts in billions. +# Configuration: +# directory: my_datasets/ + +type: blended +name: my_datasets +datasets: + - type: memmap + path: my_datasets/dataset1/shard_0_0.fast_llm_dataset + - type: memmap + path: my_datasets/dataset2/shard_0_0.fast_llm_dataset + - type: memmap + path: my_datasets/dataset3/shard_0_0.fast_llm_dataset +weights: + - 1.5 # Dataset 1: 1.5B tokens + - 2.0 # Dataset 2: 2B tokens + - 3.0 # Dataset 3: 3B tokens +``` + +### 3. Use in Training Config + +```yaml +# training_config.yaml +model: + # ... model config ... 
+ +data: + datasets: + training: + type: file + path: blended_datasets.yaml + sampling: + shuffle: skip_first_epoch + seed: 784569 + +# ... rest of training config ... +``` + +### 4. Train + +```bash +fast-llm train --config training_config.yaml +``` + +## Use Cases + +### 1. Combining Multiple Data Sources + +You have data from different sources (web scrapes, books, code, etc.) prepared separately: + +```bash +python tools/discover_datasets.py /data/pretraining -o all_pretraining_data.yaml +``` + +### 2. Incremental Data Addition + +You keep adding new datasets over time and want to automatically include all of them: + +```bash +# Just add new prepared datasets to the directory +# Re-run discovery to update the combined config +python tools/discover_datasets.py /data/pretraining -o all_pretraining_data.yaml +``` + +### 3. Experiment Organization + +You have experiments with different preprocessing or filtering: + +``` +experiments/ +├── baseline/ +│ └── dataset.fast_llm_dataset +├── filtered_v1/ +│ └── dataset.fast_llm_dataset +└── filtered_v2/ + └── dataset.fast_llm_dataset +``` + +```bash +python tools/discover_datasets.py experiments/ -o all_experiments.yaml +``` + +## Error Handling + +The tool gracefully handles errors: + +- **Missing files**: Warns and skips (returns 0 tokens) +- **Invalid format**: Warns and skips (returns 0 tokens) +- **Read errors**: Warns and skips (returns 0 tokens) +- **Files with 0 tokens**: Excluded from final config with warning + +All warnings are logged so you can see which files were skipped. + +## Implementation Details + +### File Format + +`.fast_llm_dataset` files are binary memmap files with this structure: +``` +[Header: "fast_llm_prepared_dataset"] +[Pointer to config: 8 bytes] +[Data: variable length] +[Config length: 4 bytes] +[Config JSON: variable length] +``` + +The config JSON contains metadata including `num_tokens`, which is used for weighting. + +### Key Methods + +- `_find_dataset_files()` - Recursively find `.fast_llm_dataset` files +- `_read_memmap_num_tokens()` - Read token count from binary file header +- `_create_memmap_config_for_dataset()` - Generate memmap config dict +- `_create_hierarchical_config()` - Build nested blended config tree +- `_create_directory_config()` - Recursively process directory structure +- `_group_files_by_directory()` - Group files by parent directory +- `_build_directory_tree()` - Build parent-child directory relationships + +### Required Imports + +The tool imports several config registries to ensure proper deserialization: +- `fast_llm.data.preprocessing.image_patch` +- `fast_llm.data.preprocessing.language_model` +- `fast_llm.data.sample.language_model` +- `fast_llm.data.sample.patch` +- `fast_llm.data.sample.range` +- `fast_llm.data.sample.token` + +These are loaded when reading each file to ensure the config types are registered. 
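+
+Putting the pieces together, reading the token count amounts to following the pointer stored after the file header and deserializing the embedded config JSON. A minimal sketch with error handling omitted, assuming the registration imports above are already in place:
+
+```python
+import json
+import pathlib
+
+from fast_llm.data.dataset.memmap import FILE_HEADER
+from fast_llm.data.sample.abstract import MemmapIndexDatasetReaderConfig
+
+
+def read_num_tokens(path: pathlib.Path) -> int:
+    with path.open("rb") as stream:
+        # Verify the file type.
+        assert stream.read(len(FILE_HEADER)) == FILE_HEADER
+        # Follow the 8-byte pointer to the serialized reader config at the end of the file.
+        stream.seek(int.from_bytes(stream.read(8), signed=False))
+        # Read the 4-byte config length, then the config JSON itself.
+        reader_config = MemmapIndexDatasetReaderConfig.from_dict(
+            json.loads(stream.read(int.from_bytes(stream.read(4), signed=False)).decode("utf-8"))
+        )
+    return reader_config.num_tokens
+```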
+ +## Testing + +Run tests: + +```bash +pytest tests/data/test_dataset_discovery.py +``` + +Tests cover: +- Unit tests for helper methods (file discovery, grouping, tree building) +- End-to-end tests with actual prepared datasets +- Hierarchical directory structures +- Ignore paths functionality +- Error handling (empty directories, invalid files) + +## Notes + +- **Absolute Paths**: The tool uses absolute paths for memmap files to ensure configs work regardless of where they're used from + +- **Ordering**: Datasets are discovered and ordered alphabetically by path for consistency + +- **Single Dataset**: If only one `.fast_llm_dataset` file is found, it's returned directly (not wrapped in a blended config) + +- **Empty Directories**: If no `.fast_llm_dataset` files are found, the tool will raise an error + +- **All Files Included**: The tool discovers ALL `.fast_llm_dataset` files in the directory tree. Use `--ignore` to exclude specific paths if needed + +## Benefits + +- **Automatic weighting**: No manual calculation of token counts needed +- **Hierarchical organization**: Preserves directory structure in config +- **Self-contained**: Reads all metadata from dataset files themselves +- **Simple**: Direct file scanning, no complex YAML parsing +- **Robust**: Binary format is well-defined, graceful error handling +- **Fast**: Direct header reads, no full dataset loading + +## Limitations + +- Only works with `.fast_llm_dataset` files (not arbitrary configs) +- Always generates memmap configs (doesn't support complex config composition) +- No slicing or sampling during discovery (use separate tools for that) + +For most workflows (processing datasets then training on them), these limitations are not relevant. + +## See Also + +- [Fast-LLM Data Configuration Documentation](../../../docs/recipes/data-configuration.md) +- [Dataset Preparation Guide](../../../docs/recipes/data-preparation.md) +- [GPT Memmap Preparator](../gpt_memmap/) diff --git a/tools/README_discover_datasets.md b/tools/README_discover_datasets.md deleted file mode 100644 index af0b21819..000000000 --- a/tools/README_discover_datasets.md +++ /dev/null @@ -1,267 +0,0 @@ -# Dataset Discovery Tool - -A tool to recursively discover datasets in a directory and generate a blended dataset configuration for Fast-LLM. - -## Overview - -This tool walks through a directory tree, identifies datasets by their `fast_llm_config*.yaml` files, and generates a configuration file that blends all discovered datasets with weights proportional to token counts. 
- -## Features - -- **Recursive Discovery**: Automatically finds all dataset configs in nested directories -- **Flexible Output**: Can use file references or inline full configs -- **Token-Proportional Blending**: Automatically calculates weights based on dataset token counts for proportional sampling - -## Usage - -### Command Line - -```bash -python tools/discover_datasets.py -o [options] -``` - -**Arguments:** - -- `directory`: Directory to search for datasets recursively (required) -- `-o, --output`: Output path for the generated config YAML file (required) -- `--no-file-refs`: Inline configs instead of using file references (optional, not recommended) -- `--ignore`: Path to ignore during dataset discovery (can be specified multiple times, optional) - -**Examples:** - -```bash -# Basic usage - discover all datasets and create blended config -python tools/discover_datasets.py /path/to/datasets -o blended_dataset.yaml - -# Inline full configs instead of using file references -python tools/discover_datasets.py /path/to/datasets -o blended_dataset.yaml --no-file-refs - -# Ignore specific paths during discovery -python tools/discover_datasets.py /path/to/datasets -o blended_dataset.yaml --ignore experiments/old --ignore tmp -``` - -### Config File - -Create a config file: - -```yaml -# discover_config.yaml -directory: /path/to/datasets -output: blended_dataset.yaml -use_file_refs: true -ignore_paths: [] # Optional list of paths to ignore -``` - -Run with: - -```bash -python tools/discover_datasets.py --config discover_config.yaml -``` - -## Dataset Identification - -The tool identifies datasets by looking for files matching the pattern `fast_llm_config*.yaml`: - -- `fast_llm_config.yaml` - Unsplit dataset -- `fast_llm_config_training.yaml` - Training split -- `fast_llm_config_validation.yaml` - Validation split -- Any other `fast_llm_config_*.yaml` files - -These files are typically generated by the `fast-llm prepare` command during dataset preparation. - -## Output Format - -### Blended Datasets - -The tool generates a blended dataset config with weights proportional to the number of tokens in each dataset: - -```yaml -type: blended -name: my_datasets -datasets: - - type: file - path: /path/to/dataset1/fast_llm_config_training.yaml - - type: file - path: /path/to/dataset1/fast_llm_config_validation.yaml - - type: file - path: /path/to/dataset2/fast_llm_config.yaml -weights: - - 1.5 # Dataset 1 has 1.5B tokens - - 0.5 # Dataset 2 has 0.5B tokens - - 2.0 # Dataset 3 has 2.0B tokens -``` - -With blended datasets, samples are drawn from each dataset proportionally to their weights during training. This means: - -- Larger datasets (more tokens) will be sampled more frequently -- Smaller datasets will be sampled less frequently -- The sampling is interleaved, not sequential -- Each dataset maintains its internal order, but samples from different datasets are mixed - -**Hierarchical blending:** When datasets are in nested directories, the tool automatically calculates proper token-proportional weights at all levels. Subdirectories are weighted by their total token count (sum of all datasets within them), ensuring accurate proportional sampling across the entire directory structure. 
- -**Benefits of blended datasets:** - -- **Proportional sampling**: Each dataset is sampled proportionally to its size, preventing smaller datasets from being underrepresented -- **Interleaved samples**: Unlike sequential concatenation, samples from different datasets are mixed during training -- **Automatic weight calculation**: No need to manually specify weights - they're calculated from token counts - -### Using in Training Config - -The generated config can be used directly in a training config: - -```yaml -data: - datasets: - training: - type: file - path: blended_dataset.yaml -``` - -## Example Workflow - -### 1. Prepare Multiple Datasets - -```bash -# Prepare dataset 1 -fast-llm prepare --config dataset1_prepare.yaml - -# Prepare dataset 2 -fast-llm prepare --config dataset2_prepare.yaml - -# Prepare dataset 3 -fast-llm prepare --config dataset3_prepare.yaml -``` - -This creates a directory structure like: - -``` -my_datasets/ -├── dataset1/ -│ ├── fast_llm_config_training.yaml -│ ├── fast_llm_config_validation.yaml -│ ├── dataset1_training.fast_llm_dataset -│ └── dataset1_validation.fast_llm_dataset -├── dataset2/ -│ ├── fast_llm_config_training.yaml -│ ├── fast_llm_config_validation.yaml -│ ├── dataset2_training.fast_llm_dataset -│ └── dataset2_validation.fast_llm_dataset -└── dataset3/ - └── experiments/ - ├── fast_llm_config_training.yaml - └── dataset3_training.fast_llm_dataset -``` - -### 2. Discover and Blend Datasets - -```bash -python tools/discover_datasets.py my_datasets/ -o blended_datasets.yaml -``` - -This generates `blended_datasets.yaml`: - -```yaml -type: blended -name: my_datasets -datasets: - - type: file - path: my_datasets/dataset1/fast_llm_config_training.yaml - - type: file - path: my_datasets/dataset1/fast_llm_config_validation.yaml - - type: file - path: my_datasets/dataset2/fast_llm_config_training.yaml - - type: file - path: my_datasets/dataset2/fast_llm_config_validation.yaml - - type: file - path: my_datasets/dataset3/experiments/fast_llm_config_training.yaml -weights: - - 1500.0 # Dataset 1 training: 1.5B tokens - - 500.0 # Dataset 1 validation: 500M tokens - - 2000.0 # Dataset 2 training: 2B tokens - - 800.0 # Dataset 2 validation: 800M tokens - - 3000.0 # Dataset 3 training: 3B tokens -``` - -### 3. Use in Training Config - -```yaml -# training_config.yaml -model: - # ... model config ... - -data: - datasets: - training: - type: file - path: blended_datasets.yaml - sampling: - shuffle: skip_first_epoch - seed: 784569 - -# ... rest of training config ... -``` - -### 4. Train - -```bash -fast-llm train --config training_config.yaml -``` - -## Use Cases - -### 1. Combining Multiple Data Sources - -You have data from different sources (web scrapes, books, code, etc.) prepared separately: - -```bash -python tools/discover_datasets.py /data/pretraining -o all_pretraining_data.yaml -``` - -### 2. Incremental Data Addition - -You keep adding new datasets over time and want to automatically include all of them: - -```bash -# Just add new prepared datasets to the directory -# Re-run discovery to update the combined config -python tools/discover_datasets.py /data/pretraining -o all_pretraining_data.yaml -``` - -### 3. 
Experiment Organization - -You have experiments with different preprocessing or filtering: - -``` -experiments/ -├── baseline/ -│ ├── fast_llm_config_training.yaml -│ └── fast_llm_config_validation.yaml -├── filtered_v1/ -│ ├── fast_llm_config_training.yaml -│ └── fast_llm_config_validation.yaml -└── filtered_v2/ - ├── fast_llm_config_training.yaml - └── fast_llm_config_validation.yaml -``` - -```bash -python tools/discover_datasets.py experiments/ -o all_experiments.yaml -``` - -## Notes - -- **File References**: By default, the tool uses `type: file` references which lazily load the actual dataset configs. This keeps the generated config small and readable. - -- **Absolute Paths**: The tool uses absolute paths for file references to ensure configs work regardless of where they're used from. - -- **Ordering**: Datasets are discovered and ordered alphabetically by path for consistency. - -- **Empty Directories**: If no `fast_llm_config*.yaml` files are found, the tool will raise an error. - -- **All Files Included**: The tool blends ALL discovered config files with weights proportional to their token counts. This means if you have both training and validation configs in the same directory, they will all be included in the blended dataset. You may want to organize your directory structure accordingly or use the `--ignore` flag to exclude specific paths. - -## See Also - -- [Fast-LLM Data Configuration Documentation](../docs/recipes/data-configuration.md) -- [Dataset Preparation Guide](../docs/recipes/data-preparation.md) From 1687c900d6a3345ea486a87008bd63e1cfb6f206 Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Tue, 6 Jan 2026 20:02:44 +0000 Subject: [PATCH 10/13] update readme --- .../preparator/dataset_discovery/README.md | 374 ++---------------- 1 file changed, 42 insertions(+), 332 deletions(-) diff --git a/fast_llm/data/preparator/dataset_discovery/README.md b/fast_llm/data/preparator/dataset_discovery/README.md index 8e7e6d590..70cf6628f 100644 --- a/fast_llm/data/preparator/dataset_discovery/README.md +++ b/fast_llm/data/preparator/dataset_discovery/README.md @@ -1,383 +1,93 @@ # Dataset Discovery -A tool to automatically discover `.fast_llm_dataset` files in a directory tree and generate blended configuration files with weights proportional to token counts. +Automatically discover `.fast_llm_dataset` files and generate a blended config with token-proportional weights. -## Overview - -The dataset discovery preparator walks through a directory tree, identifies datasets by their `.fast_llm_dataset` files, and generates a configuration file that blends all discovered datasets with weights proportional to token counts. - -### How It Works - -1. Recursively scans a directory for `.fast_llm_dataset` files -2. Reads token counts directly from the binary file headers (no YAML parsing needed) -3. Generates a hierarchical blended config with weights proportional to token counts -4. Groups datasets by directory structure - -This is useful when you have a collection of prepared datasets and want to automatically create a blended training config without manually specifying each dataset and its weight. 
- -## Features - -- **Automatic Discovery**: Recursively finds all `.fast_llm_dataset` files in nested directories -- **Direct Token Reading**: Reads token counts from binary file headers (fast and reliable) -- **Hierarchical Blending**: Preserves directory structure in generated config -- **Token-Proportional Weights**: Automatically calculates weights based on token counts for proportional sampling -- **Ignore Paths**: Exclude specific directories from discovery -- **Robust Error Handling**: Gracefully handles missing/invalid files - -## Usage - -### Command Line (using tools wrapper) +## Quick Start ```bash -python tools/discover_datasets.py -o [options] +python tools/discover_datasets.py -o ``` -**Arguments:** - -- `directory`: Directory to search for datasets recursively (required) -- `-o, --output`: Output path for the generated config YAML file (required) -- `--ignore`: Path to ignore during dataset discovery (can be specified multiple times, optional) - -**Examples:** - -```bash -# Basic usage - discover all datasets and create blended config -python tools/discover_datasets.py /path/to/datasets -o blended_dataset.yaml - -# Ignore specific paths during discovery -python tools/discover_datasets.py /path/to/datasets -o blended_dataset.yaml --ignore experiments/old --ignore tmp -``` - -### Config File - -Create a YAML config file: - +Or with config file: ```yaml -# discover_config.yaml type: prepare_dataset_discovery directory: /path/to/datasets output: blended_dataset.yaml -ignore_paths: # Optional - - ./test_data - - ./checkpoints +ignore_paths: [test_data, checkpoints] # Optional ``` -Run with Fast-LLM's config system: - ```bash -python -m fast_llm.engine.config_utils.run --config discover_config.yaml +python -m fast_llm.engine.config_utils.run --config config.yaml ``` -Or directly via command line: +## What It Does -```bash -python -m fast_llm.engine.config_utils.run prepare_dataset_discovery \ - --directory /path/to/datasets \ - --output discovered_datasets.yaml -``` +1. Scans directory tree for `.fast_llm_dataset` files +2. Reads token counts from binary file headers +3. Generates hierarchical blended config with automatic weights +4. Preserves directory structure -Or using the tools wrapper with config: +## Example -```bash -python tools/discover_datasets.py --config discover_config.yaml -``` - -## Configuration Options - -- **directory** (required): Root directory to scan for `.fast_llm_dataset` files -- **output** (required): Where to save the generated blended config YAML -- **ignore_paths** (optional): List of paths to exclude from discovery - - Can be absolute paths or relative to the directory - - Any datasets under these paths will be skipped - -## Dataset Identification - -The tool identifies datasets by looking for files with the `.fast_llm_dataset` extension. These files are binary memmap files created by Fast-LLM's dataset preparation commands. 
- -Unlike the old implementation, this tool: -- ❌ Does NOT look for `fast_llm_config*.yaml` files -- ✅ Directly scans for `.fast_llm_dataset` binary files -- ✅ Reads metadata from binary file headers (faster, more reliable) - -## Output Format - -### Hierarchical Blended Config - -The tool generates a hierarchical blended config that mirrors your directory structure: - -```yaml -type: blended -name: root_directory -datasets: - - type: blended - name: domain_a_local # Local datasets grouped - datasets: - - type: memmap - path: /path/to/datasets/domain_a/shard_0.fast_llm_dataset - - type: memmap - path: /path/to/datasets/domain_a/shard_1.fast_llm_dataset - weights: [1.0, 1.0] # Token counts in billions - - type: memmap - path: /path/to/datasets/domain_b/shard_0.fast_llm_dataset # Single dataset (not wrapped) -weights: [2.0, 3.0] # Total tokens per group in billions -``` - -### Directory Structure Example - -Given this file structure: +Input directory structure: ``` datasets/ ├── domain_a/ │ ├── shard_0.fast_llm_dataset (1B tokens) │ └── shard_1.fast_llm_dataset (1B tokens) -├── domain_b/ -│ ├── shard_0.fast_llm_dataset (2B tokens) -│ └── shard_1.fast_llm_dataset (2B tokens) -└── domain_c/ +└── domain_b/ └── shard_0.fast_llm_dataset (4B tokens) ``` -The generated config will blend: -- `domain_a`: 2B tokens total (20% of samples) -- `domain_b`: 4B tokens total (40% of samples) -- `domain_c`: 4B tokens total (40% of samples) - -### Blended Datasets Explained - -With blended datasets, samples are drawn from each dataset proportionally to their weights during training: - -- **Proportional sampling**: Larger datasets (more tokens) are sampled more frequently -- **Interleaved samples**: Unlike concatenation, samples from different datasets are mixed -- **Automatic weights**: Calculated from token counts - no manual specification needed -- **Hierarchical weighting**: Subdirectories are weighted by their total token count - -### Using in Training Config - -The generated config can be used directly in a training config: - -```yaml -data: - datasets: - training: - type: file - path: blended_dataset.yaml -``` - -## Example Workflow - -### 1. Prepare Multiple Datasets - -```bash -# Prepare dataset 1 -fast-llm prepare --config dataset1_prepare.yaml - -# Prepare dataset 2 -fast-llm prepare --config dataset2_prepare.yaml - -# Prepare dataset 3 -fast-llm prepare --config dataset3_prepare.yaml -``` - -This creates a directory structure like: - -``` -my_datasets/ -├── dataset1/ -│ ├── shard_0_0.fast_llm_dataset -│ └── fast_llm_config.yaml -├── dataset2/ -│ ├── shard_0_0.fast_llm_dataset -│ └── fast_llm_config.yaml -└── dataset3/ - ├── shard_0_0.fast_llm_dataset - └── fast_llm_config.yaml -``` - -### 2. Discover and Blend Datasets - -```bash -python tools/discover_datasets.py my_datasets/ -o blended_datasets.yaml -``` - -This generates `blended_datasets.yaml`: - +Generated config (`blended.yaml`): ```yaml -# This file was generated with fast_llm.data.preparator.dataset_discovery; weights are token-counts in billions. 
-# Configuration: -# directory: my_datasets/ - type: blended -name: my_datasets +name: datasets datasets: + - type: blended + name: domain_a + datasets: + - type: memmap + path: datasets/domain_a/shard_0.fast_llm_dataset + - type: memmap + path: datasets/domain_a/shard_1.fast_llm_dataset + weights: [1.0, 1.0] - type: memmap - path: my_datasets/dataset1/shard_0_0.fast_llm_dataset - - type: memmap - path: my_datasets/dataset2/shard_0_0.fast_llm_dataset - - type: memmap - path: my_datasets/dataset3/shard_0_0.fast_llm_dataset -weights: - - 1.5 # Dataset 1: 1.5B tokens - - 2.0 # Dataset 2: 2B tokens - - 3.0 # Dataset 3: 3B tokens + path: datasets/domain_b/shard_0.fast_llm_dataset +weights: [2.0, 4.0] # In billions ``` -### 3. Use in Training Config - +Use in training: ```yaml -# training_config.yaml -model: - # ... model config ... - data: datasets: training: type: file - path: blended_datasets.yaml - sampling: - shuffle: skip_first_epoch - seed: 784569 - -# ... rest of training config ... + path: blended.yaml ``` -### 4. Train +## Options -```bash -fast-llm train --config training_config.yaml -``` - -## Use Cases - -### 1. Combining Multiple Data Sources - -You have data from different sources (web scrapes, books, code, etc.) prepared separately: - -```bash -python tools/discover_datasets.py /data/pretraining -o all_pretraining_data.yaml -``` - -### 2. Incremental Data Addition +- **directory**: Root directory to scan (required) +- **output**: Output YAML file path (required) +- **ignore_paths**: Paths to exclude, relative or absolute (optional) -You keep adding new datasets over time and want to automatically include all of them: +## Key Features -```bash -# Just add new prepared datasets to the directory -# Re-run discovery to update the combined config -python tools/discover_datasets.py /data/pretraining -o all_pretraining_data.yaml -``` - -### 3. Experiment Organization - -You have experiments with different preprocessing or filtering: +- **Token-proportional sampling**: Datasets sampled by token count (larger datasets sampled more) +- **Hierarchical grouping**: Directory structure preserved in config +- **Automatic weights**: Calculated from binary file metadata +- **Error handling**: Skips unreadable files with warnings -``` -experiments/ -├── baseline/ -│ └── dataset.fast_llm_dataset -├── filtered_v1/ -│ └── dataset.fast_llm_dataset -└── filtered_v2/ - └── dataset.fast_llm_dataset -``` - -```bash -python tools/discover_datasets.py experiments/ -o all_experiments.yaml -``` - -## Error Handling - -The tool gracefully handles errors: - -- **Missing files**: Warns and skips (returns 0 tokens) -- **Invalid format**: Warns and skips (returns 0 tokens) -- **Read errors**: Warns and skips (returns 0 tokens) -- **Files with 0 tokens**: Excluded from final config with warning - -All warnings are logged so you can see which files were skipped. - -## Implementation Details - -### File Format - -`.fast_llm_dataset` files are binary memmap files with this structure: -``` -[Header: "fast_llm_prepared_dataset"] -[Pointer to config: 8 bytes] -[Data: variable length] -[Config length: 4 bytes] -[Config JSON: variable length] -``` - -The config JSON contains metadata including `num_tokens`, which is used for weighting. 
- -### Key Methods - -- `_find_dataset_files()` - Recursively find `.fast_llm_dataset` files -- `_read_memmap_num_tokens()` - Read token count from binary file header -- `_create_memmap_config_for_dataset()` - Generate memmap config dict -- `_create_hierarchical_config()` - Build nested blended config tree -- `_create_directory_config()` - Recursively process directory structure -- `_group_files_by_directory()` - Group files by parent directory -- `_build_directory_tree()` - Build parent-child directory relationships - -### Required Imports - -The tool imports several config registries to ensure proper deserialization: -- `fast_llm.data.preprocessing.image_patch` -- `fast_llm.data.preprocessing.language_model` -- `fast_llm.data.sample.language_model` -- `fast_llm.data.sample.patch` -- `fast_llm.data.sample.range` -- `fast_llm.data.sample.token` +## Notes -These are loaded when reading each file to ensure the config types are registered. +- Single datasets returned directly (not wrapped) +- Files with 0 tokens skipped with warning +- Empty directories raise error +- Datasets sorted alphabetically ## Testing -Run tests: - ```bash pytest tests/data/test_dataset_discovery.py ``` - -Tests cover: -- Unit tests for helper methods (file discovery, grouping, tree building) -- End-to-end tests with actual prepared datasets -- Hierarchical directory structures -- Ignore paths functionality -- Error handling (empty directories, invalid files) - -## Notes - -- **Absolute Paths**: The tool uses absolute paths for memmap files to ensure configs work regardless of where they're used from - -- **Ordering**: Datasets are discovered and ordered alphabetically by path for consistency - -- **Single Dataset**: If only one `.fast_llm_dataset` file is found, it's returned directly (not wrapped in a blended config) - -- **Empty Directories**: If no `.fast_llm_dataset` files are found, the tool will raise an error - -- **All Files Included**: The tool discovers ALL `.fast_llm_dataset` files in the directory tree. Use `--ignore` to exclude specific paths if needed - -## Benefits - -- **Automatic weighting**: No manual calculation of token counts needed -- **Hierarchical organization**: Preserves directory structure in config -- **Self-contained**: Reads all metadata from dataset files themselves -- **Simple**: Direct file scanning, no complex YAML parsing -- **Robust**: Binary format is well-defined, graceful error handling -- **Fast**: Direct header reads, no full dataset loading - -## Limitations - -- Only works with `.fast_llm_dataset` files (not arbitrary configs) -- Always generates memmap configs (doesn't support complex config composition) -- No slicing or sampling during discovery (use separate tools for that) - -For most workflows (processing datasets then training on them), these limitations are not relevant. 
- -## See Also - -- [Fast-LLM Data Configuration Documentation](../../../docs/recipes/data-configuration.md) -- [Dataset Preparation Guide](../../../docs/recipes/data-preparation.md) -- [GPT Memmap Preparator](../gpt_memmap/) From 2987fdb1d3f06520daf25c7e2c840e585d837185 Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Tue, 6 Jan 2026 20:25:27 +0000 Subject: [PATCH 11/13] update --- fast_llm/data/auto.py | 1 + fast_llm/data/preparator/dataset_discovery/README.md | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fast_llm/data/auto.py b/fast_llm/data/auto.py index 22ab3d731..4a873458b 100644 --- a/fast_llm/data/auto.py +++ b/fast_llm/data/auto.py @@ -14,4 +14,5 @@ GPTFimSampledDatasetConfig, GPTRandomDatasetConfig, ) +from fast_llm.data.preparator.dataset_discovery.config import DatasetDiscoveryConfig # isort: skip from fast_llm.data.preparator.gpt_memmap.config import GPTMemmapDatasetPreparatorConfig # isort: skip diff --git a/fast_llm/data/preparator/dataset_discovery/README.md b/fast_llm/data/preparator/dataset_discovery/README.md index 70cf6628f..b88347f0d 100644 --- a/fast_llm/data/preparator/dataset_discovery/README.md +++ b/fast_llm/data/preparator/dataset_discovery/README.md @@ -4,11 +4,12 @@ Automatically discover `.fast_llm_dataset` files and generate a blended config w ## Quick Start +Using the tools wrapper: ```bash python tools/discover_datasets.py -o ``` -Or with config file: +Using Fast-LLM CLI with config file: ```yaml type: prepare_dataset_discovery directory: /path/to/datasets @@ -17,7 +18,7 @@ ignore_paths: [test_data, checkpoints] # Optional ``` ```bash -python -m fast_llm.engine.config_utils.run --config config.yaml +python -m fast_llm.cli --config config.yaml ``` ## What It Does From 4941c05368f0b5665dbfd2d26e28005828d22514 Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Fri, 9 Jan 2026 22:58:33 +0000 Subject: [PATCH 12/13] refactor read-reader-config and add auto imports --- fast_llm/data/auto.py | 5 ++++ fast_llm/data/dataset/memmap.py | 25 ++++++++++++------- .../preparator/dataset_discovery/prepare.py | 22 +++------------- 3 files changed, 24 insertions(+), 28 deletions(-) diff --git a/fast_llm/data/auto.py b/fast_llm/data/auto.py index 4a873458b..d39ce1e4a 100644 --- a/fast_llm/data/auto.py +++ b/fast_llm/data/auto.py @@ -16,3 +16,8 @@ ) from fast_llm.data.preparator.dataset_discovery.config import DatasetDiscoveryConfig # isort: skip from fast_llm.data.preparator.gpt_memmap.config import GPTMemmapDatasetPreparatorConfig # isort: skip +from fast_llm.data.sample.abstract import NullReaderConfig # isort: skip +from fast_llm.data.sample.language_model import LanguageModelReaderConfig # isort: skip +from fast_llm.data.sample.patch import PatchReaderConfig # isort: skip +from fast_llm.data.sample.range import RangeReaderConfig # isort: skip +from fast_llm.data.sample.token import TokenReaderConfig # isort: skip diff --git a/fast_llm/data/dataset/memmap.py b/fast_llm/data/dataset/memmap.py index f80a48b0a..4b62d9d8a 100644 --- a/fast_llm/data/dataset/memmap.py +++ b/fast_llm/data/dataset/memmap.py @@ -18,6 +18,21 @@ class MemmapDataset[SampleType: Sample](IndexedDataset[SampleType]): A memory map dataset, which handles lazy loading of a pre-processed dataset. """ + @staticmethod + def read_reader_config(path: pathlib.Path | str) -> MemmapIndexDatasetReaderConfig: + """ + Read the MemmapIndexDatasetReaderConfig from a memmap file. 
+ """ + path = pathlib.Path(path) if isinstance(path, str) else path + with path.open("rb") as stream: + # Verify file type. + assert stream.read(len(FILE_HEADER)) == FILE_HEADER + # Go to reader configs. + stream.seek(int.from_bytes(stream.read(8), signed=False)) + # Read the reader config. + config_bytes = stream.read(int.from_bytes(stream.read(4), signed=False)) + return MemmapIndexDatasetReaderConfig.from_dict(json.loads(config_bytes.decode("utf-8"))) + def __init__( self, name: str, @@ -32,15 +47,7 @@ def _init(self, name: str, path: pathlib.Path | str, preprocessing: Preprocessin self._path = path self._preprocessing = preprocessing - with self._path.open("rb") as stream: - # Very file type. - assert stream.read(len(FILE_HEADER)) == FILE_HEADER - # Go to reader configs. - stream.seek(int.from_bytes(stream.read(8), signed=False)) - # Read the reader config. - reader_config = MemmapIndexDatasetReaderConfig.from_dict( - json.loads(stream.read(int.from_bytes(stream.read(4), signed=False)).decode("utf-8")) - ) + reader_config = self.read_reader_config(self._path) self._memmap = np.memmap(self._path, mode="r") self._reader = reader_config.get_reader(memoryview(self._memmap), self._preprocessing) diff --git a/fast_llm/data/preparator/dataset_discovery/prepare.py b/fast_llm/data/preparator/dataset_discovery/prepare.py index 42d4f125e..25a29ca3e 100644 --- a/fast_llm/data/preparator/dataset_discovery/prepare.py +++ b/fast_llm/data/preparator/dataset_discovery/prepare.py @@ -5,13 +5,13 @@ and reading token counts from their binary headers. """ -import json import logging import pathlib from collections import defaultdict import yaml +from fast_llm.data.dataset.memmap import MemmapDataset from fast_llm.data.preparator.config import DatasetPreparator from fast_llm.data.preparator.dataset_discovery.config import DatasetDiscoveryConfig @@ -117,30 +117,14 @@ def _find_dataset_files( @staticmethod def _read_memmap_num_tokens(memmap_path: pathlib.Path) -> int: """Read number of tokens from a .fast_llm_dataset memmap file.""" - # Import preprocessing and sample configs to register them - import fast_llm.data.preprocessing.image_patch # noqa - import fast_llm.data.preprocessing.language_model # noqa - import fast_llm.data.sample.language_model # noqa - import fast_llm.data.sample.patch # noqa - import fast_llm.data.sample.range # noqa - import fast_llm.data.sample.token # noqa - from fast_llm.data.dataset.memmap import FILE_HEADER - from fast_llm.data.sample.abstract import MemmapIndexDatasetReaderConfig if not memmap_path.exists(): logger.warning(f"Memmap file not found: {memmap_path}") return 0 try: - with memmap_path.open("rb") as stream: - header = stream.read(len(FILE_HEADER)) - if header != FILE_HEADER: - logger.warning(f"Invalid memmap file format: {memmap_path}") - return 0 - stream.seek(int.from_bytes(stream.read(8), signed=False)) - config_bytes = stream.read(int.from_bytes(stream.read(4), signed=False)) - reader_config = MemmapIndexDatasetReaderConfig.from_dict(json.loads(config_bytes.decode("utf-8"))) - return reader_config.num_tokens + reader_config = MemmapDataset.read_reader_config(memmap_path) + return reader_config.num_tokens except Exception as e: logger.warning(f"Failed to read memmap file {memmap_path}: {e}") return 0 From 10f71d725745dbe066d62fad15916096c8484cbd Mon Sep 17 00:00:00 2001 From: Toolkit User Date: Fri, 9 Jan 2026 23:01:02 +0000 Subject: [PATCH 13/13] rm tool --- tools/discover_datasets.py | 56 -------------------------------------- 1 file changed, 56 deletions(-) 
delete mode 100644 tools/discover_datasets.py diff --git a/tools/discover_datasets.py b/tools/discover_datasets.py deleted file mode 100644 index 0744ec10c..000000000 --- a/tools/discover_datasets.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Tool to recursively discover datasets in a directory and generate a blended dataset config. - -This tool is a command-line wrapper around the DatasetDiscoveryPreparator. -For programmatic usage, use fast_llm.data.preparator.dataset_discovery directly. -""" - -import argparse -import logging -import pathlib - -from fast_llm.data.preparator.dataset_discovery import DatasetDiscoveryConfig - - -def main(): - """ - Command-line entry point. - """ - parser = argparse.ArgumentParser(description="Discover datasets and generate hierarchical blended config") - parser.add_argument("directory", type=pathlib.Path, help="Directory to search for datasets recursively") - parser.add_argument( - "-o", "--output", type=pathlib.Path, required=True, help="Output path for the generated config YAML file" - ) - parser.add_argument("--no-file-refs", action="store_true", help="Inline configs instead of using file references") - parser.add_argument( - "--ignore", - type=pathlib.Path, - action="append", - dest="ignore_paths", - help="Path to ignore during dataset discovery (can be specified multiple times)", - ) - - args = parser.parse_args() - - # Configure logging - logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") - - # Create and run the config - config = DatasetDiscoveryConfig( - directory=args.directory, - output=args.output, - use_file_refs=not args.no_file_refs, - ignore_paths=args.ignore_paths or [], - ) - config.run() - - -if __name__ == "__main__": - # Support both CLI usage and Fast-LLM's config system - import sys - - # Check if using argparse-style CLI (positional arg without --config) - if len(sys.argv) > 1 and not sys.argv[1].startswith("-") and sys.argv[1] != "--config": - main() - else: - DatasetDiscoveryConfig.parse_and_run()