diff --git a/.gitignore b/.gitignore index cdffe91..584d441 100644 --- a/.gitignore +++ b/.gitignore @@ -24,7 +24,6 @@ __pycache__/ # Distribution / packaging .Python -env/ build/ develop-eggs/ dist/ @@ -37,6 +36,7 @@ parts/ sdist/ var/ wheels/ +share/python-wheels/ *.egg-info/ .installed.cfg *.egg @@ -55,13 +55,17 @@ pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ +.nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml -*,cover +*.cover +*.py,cover .hypothesis/ +.pytest_cache/ +cover/ # Translations *.mo @@ -70,6 +74,8 @@ coverage.xml # Django stuff: *.log local_settings.py +db.sqlite3 +db.sqlite3-journal # Flask stuff: instance/ @@ -82,32 +88,101 @@ instance/ docs/_build/ # PyBuilder +.pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints +# IPython +profile_default/ +ipython_config.py + # pyenv .python-version -# celery beat schedule file +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +#uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff celerybeat-schedule +celerybeat.pid -# dotenv -.env +# SageMath parsed files +*.sage.py -# virtualenv -.venv/ +# Environments +.env +.venv +env/ venv/ ENV/ +env.bak/ +venv.bak/ # Spyder project settings .spyderproject +.spyproject # Rope project settings .ropeproject -# vscode -.vscode/ -.vs/ -© 2019 GitHub, Inc. +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc diff --git a/deeplc/__init__.py b/deeplc/__init__.py index 953aa36..e4fb659 100644 --- a/deeplc/__init__.py +++ b/deeplc/__init__.py @@ -1,14 +1,8 @@ -import sys +__all__ = ["DeepLC"] +from importlib.metadata import version -if sys.version_info >= (3,8): - from importlib.metadata import version - __version__ = version('deeplc') -else: - import pkg_resources - __version__ = pkg_resources.require("deeplc")[0].version +__version__ = version("deeplc") from deeplc.deeplc import DeepLC -from deeplc.feat_extractor import FeatExtractor - diff --git a/deeplc/__main__.py b/deeplc/__main__.py index eb6079f..e76278b 100644 --- a/deeplc/__main__.py +++ b/deeplc/__main__.py @@ -1,7 +1,12 @@ """Main command line interface to DeepLC.""" __author__ = ["Robbin Bouwmeester", "Ralf Gabriels"] -__credits__ = ["Robbin Bouwmeester", "Ralf Gabriels", "Prof. Lennart Martens", "Sven Degroeve"] +__credits__ = [ + "Robbin Bouwmeester", + "Ralf Gabriels", + "Prof. Lennart Martens", + "Sven Degroeve", +] __license__ = "Apache License, Version 2.0" __maintainer__ = ["Robbin Bouwmeester", "Ralf Gabriels"] __email__ = ["Robbin.Bouwmeester@ugent.be", "Ralf.Gabriels@ugent.be"] @@ -12,12 +17,12 @@ import warnings import pandas as pd +from psm_utils.io import read_file from psm_utils.io.peptide_record import peprec_to_proforma from psm_utils.psm import PSM from psm_utils.psm_list import PSMList -from psm_utils.io import read_file -from deeplc import __version__, DeepLC, FeatExtractor +from deeplc import DeepLC, __version__ from deeplc._argument_parser import parse_arguments from deeplc._exceptions import DeepLCError @@ -26,27 +31,28 @@ def setup_logging(passed_level): log_mapping = { - 'critical': logging.CRITICAL, - 'error': logging.ERROR, - 'warning': logging.WARNING, - 'info': logging.INFO, - 'debug': logging.DEBUG, + "critical": logging.CRITICAL, + "error": logging.ERROR, + "warning": logging.WARNING, + "info": logging.INFO, + "debug": logging.DEBUG, } if passed_level.lower() not in log_mapping: print( "Invalid log level. 
Should be one of the following: ",
-        ', '.join(log_mapping.keys())
+        ", ".join(log_mapping.keys()),
     )
     exit(1)
 logging.basicConfig(
     stream=sys.stdout,
-        format='%(asctime)s - %(levelname)s - %(message)s',
-        datefmt='%Y-%m-%d %H:%M:%S',
-        level=log_mapping[passed_level.lower()]
+        format="%(asctime)s - %(levelname)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+        level=log_mapping[passed_level.lower()],
     )
+
 def main(gui=False):
     """Main function for the CLI."""
     argu = parse_arguments(gui=gui)
@@ -55,13 +61,13 @@ def main(gui=False):
     # Reset logging levels if DEBUG (see deeplc.py)
     if argu.log_level.lower() == "debug":
-        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'
-        logging.getLogger('tensorflow').setLevel(logging.DEBUG)
-        warnings.filterwarnings('default', category=DeprecationWarning)
-        warnings.filterwarnings('default', category=FutureWarning)
-        warnings.filterwarnings('default', category=UserWarning)
+        os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0"
+        logging.getLogger("tensorflow").setLevel(logging.DEBUG)
+        warnings.filterwarnings("default", category=DeprecationWarning)
+        warnings.filterwarnings("default", category=FutureWarning)
+        warnings.filterwarnings("default", category=UserWarning)
     else:
-        os.environ['KMP_WARNINGS'] = '0'
+        os.environ["KMP_WARNINGS"] = "0"
 
     try:
         run(**vars(argu))
@@ -101,13 +107,13 @@ def run(
         for fm in file_model:
             if len(sel_group) == 0:
                 sel_group = "_".join(fm.split("_")[:-1])
-                fm_dict[sel_group]= fm
+                fm_dict[sel_group] = fm
                 continue
             m_group = "_".join(fm.split("_")[:-1])
             if m_group == sel_group:
                 fm_dict[m_group] = fm
         file_model = fm_dict
-    
+
     with open(file_pred) as f:
         first_line_pred = f.readline().strip()
     if file_cal:
@@ -118,53 +124,68 @@ def run(
         # Read input files
         df_pred = pd.read_csv(file_pred)
         if len(df_pred.columns) < 2:
-            df_pred = pd.read_csv(file_pred,sep=" ")
+            df_pred = pd.read_csv(file_pred, sep=" ")
         df_pred = df_pred.fillna("")
         file_pred = ""
         list_of_psms = []
-        for seq,mod,ident in zip(df_pred["seq"],df_pred["modifications"],df_pred.index):
-            list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident))
+        for seq, mod, ident in zip(df_pred["seq"], df_pred["modifications"], df_pred.index):
+            list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq, mod), spectrum_id=ident))
         psm_list_pred = PSMList(psm_list=list_of_psms)
         df_pred = None
     else:
         psm_list_pred = read_file(file_pred)
         if "msms" in file_pred and ".txt" in file_pred:
-            mapper = pd.read_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)), "unimod/map_mq_file.csv"),index_col=0)["value"].to_dict()
+            mapper = pd.read_csv(
+                os.path.join(
+                    os.path.dirname(os.path.realpath(__file__)),
+                    "unimod/map_mq_file.csv",
+                ),
+                index_col=0,
+            )["value"].to_dict()
             psm_list_pred.rename_modifications(mapper)
 
     # Allow for calibration file to be empty (undefined), fill in if/elif if present
     psm_list_cal = []
-    if "modifications" in first_line_cal.split(",") and "seq" in first_line_cal.split(",") and file_cal:
+    if (
+        "modifications" in first_line_cal.split(",")
+        and "seq" in first_line_cal.split(",")
+        and file_cal
+    ):
         df_cal = pd.read_csv(file_cal)
         if len(df_cal.columns) < 2:
-            df_cal = pd.read_csv(df_cal,sep=" ")
+            df_cal = pd.read_csv(file_cal, sep=" ")
         df_cal = df_cal.fillna("")
         file_cal = ""
         list_of_psms = []
-        for seq,mod,ident,tr in zip(df_cal["seq"],df_cal["modifications"],df_cal.index,df_cal["tr"]):
-            list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr))
+        for seq, mod, ident, tr in zip(
+            df_cal["seq"], df_cal["modifications"], df_cal.index, df_cal["tr"]
+        ):
+            list_of_psms.append(
+                PSM(
+                    peptidoform=peprec_to_proforma(seq, mod),
+                    spectrum_id=ident,
+                    retention_time=tr,
+                )
+            )
         psm_list_cal = PSMList(psm_list=list_of_psms)
         df_cal = None
     elif file_cal:
         psm_list_cal = read_file(file_cal)
         if "msms" in file_cal and ".txt" in file_cal:
-            mapper = pd.read_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)), "unimod/map_mq_file.csv"),index_col=0)["value"].to_dict()
+            mapper = pd.read_csv(
+                os.path.join(
+                    os.path.dirname(os.path.realpath(__file__)),
+                    "unimod/map_mq_file.csv",
+                ),
+                index_col=0,
+            )["value"].to_dict()
             psm_list_cal.rename_modifications(mapper)
 
-    # Make a feature extraction object; you can skip this if you do not want to
-    # use the default settings for DeepLC. Here we want to use a model that does
-    # not use RDKit features so we skip the chemical descriptor making
-    # procedure.
-    f_extractor = FeatExtractor(
-        cnn_feats=True,
-        verbose=verbose
-    )
-
+    # Make the DeepLC object that will handle making predictions and calibration
     dlc = DeepLC(
         path_model=file_model,
-        f_extractor=f_extractor,
         cnn_model=True,
         split_cal=split_cal,
         dict_cal_divider=dict_divider,
@@ -173,9 +194,9 @@ def run(
         batch_num=batch_num,
         n_jobs=n_threads,
         verbose=verbose,
-        deeplc_retrain=transfer_learning
+        deeplc_retrain=transfer_learning,
     )
-    
+
     # Calibrate the original model based on the new retention times
     if len(psm_list_cal) > 0:
         logger.info("Selecting best model and calibrating predictions...")
@@ -185,16 +206,21 @@ def run(
     # Make predictions; calibrated or uncalibrated
     logger.info("Making predictions using model: %s", dlc.model)
     if len(psm_list_cal) > 0:
         preds = dlc.make_preds(seq_df=df_pred, infile=file_pred, psm_list=psm_list_pred)
     else:
-        preds = dlc.make_preds(seq_df=df_pred, infile=file_pred, psm_list=psm_list_pred, calibrate=False)
-
-    #df_pred["predicted_tr"] = preds
+        preds = dlc.make_preds(
+            seq_df=df_pred,
+            infile=file_pred,
+            psm_list=psm_list_pred,
+            calibrate=False,
+        )
+
+    # df_pred["predicted_tr"] = preds
     logger.info("Writing predictions to file: %s", file_pred_out)
-    
-    file_pred_out = open(file_pred_out,"w")
+
+    file_pred_out = open(file_pred_out, "w")
     file_pred_out.write("Sequence proforma,predicted retention time\n")
-    for psm,tr in zip(psm_list_pred,preds):
+    for psm, tr in zip(psm_list_pred, preds):
         file_pred_out.write(f"{psm.peptidoform.proforma},{tr}\n")
     file_pred_out.close()
diff --git a/deeplc/_argument_parser.py b/deeplc/_argument_parser.py
index 09875cf..e12f526 100644
--- a/deeplc/_argument_parser.py
+++ b/deeplc/_argument_parser.py
@@ -74,17 +74,13 @@ def parse_arguments(gui=False):
 
     parser = ArgumentParser(
         prog="DeepLC",
-        description=(
-            "Retention time prediction for (modified) peptides using deep " "learning."
- ), + description=("Retention time prediction for (modified) peptides using deep learning."), usage="deeplc [OPTIONS] --file_pred ", formatter_class=lambda prog: HelpFormatter(prog, max_help_position=42), add_help=False, ) - io_args = parser.add_argument_group( - "Input and output files", **gooey_args["io_args"] - ) + io_args = parser.add_argument_group("Input and output files", **gooey_args["io_args"]) io_args.add_argument( "--file_pred", required=True, @@ -97,9 +93,7 @@ def parse_arguments(gui=False): type=str, default=None, metavar="Input peptides for calibration" if gui else "", - help=( - "path to peptide CSV file with retention times to use for " "calibration" - ), + help=("path to peptide CSV file with retention times to use for calibration"), **gooey_args["file_cal"], ) io_args.add_argument( @@ -166,10 +160,7 @@ def parse_arguments(gui=False): dest="split_cal", default=50, metavar="split cal" if gui else "", - help=( - "number of splits in the chromatogram for piecewise linear " - "calibration fit" - ), + help=("number of splits in the chromatogram for piecewise linear calibration fit"), **gooey_args["split_cal"], ) model_cal_args.add_argument( @@ -265,8 +256,6 @@ def parse_arguments(gui=False): results = parser.parse_args() if not results.file_pred_out: - results.file_pred_out = ( - os.path.splitext(results.file_pred)[0] + "_deeplc_predictions.csv" - ) + results.file_pred_out = os.path.splitext(results.file_pred)[0] + "_deeplc_predictions.csv" return results diff --git a/deeplc/_exceptions.py b/deeplc/_exceptions.py index 1be41ae..c3cde3a 100644 --- a/deeplc/_exceptions.py +++ b/deeplc/_exceptions.py @@ -1,5 +1,6 @@ """DeepLC exceptions.""" + class DeepLCError(Exception): pass diff --git a/deeplc/deeplc.py b/deeplc/deeplc.py index 589c040..22f1b92 100644 --- a/deeplc/deeplc.py +++ b/deeplc/deeplc.py @@ -4,6 +4,8 @@ This provides the main interface. For the library versions see the .yml file """ +from __future__ import annotations + __author__ = ["Robbin Bouwmeester", "Ralf Gabriels"] __license__ = "Apache License, Version 2.0" __maintainer__ = ["Robbin Bouwmeester", "Ralf Gabriels"] @@ -12,192 +14,351 @@ "Robbin Bouwmeester", "Ralf Gabriels", "Arthur Declercq", + "Alireza Nameni", "Lennart Martens", "Sven Degroeve", ] - # Default models, will be used if no other is specified. If no best model is # selected during calibration, the first model in the list will be used. 
-import os - -deeplc_dir = os.path.dirname(os.path.realpath(__file__)) -DEFAULT_MODELS = [ - "mods/full_hc_PXD005573_pub_1fd8363d9af9dcad3be7553c39396960.keras", - "mods/full_hc_PXD005573_pub_8c22d89667368f2f02ad996469ba157e.keras", - "mods/full_hc_PXD005573_pub_cb975cfdd4105f97efa0b3afffe075cc.keras", -] -DEFAULT_MODELS = [os.path.join(deeplc_dir, dm) for dm in DEFAULT_MODELS] - -LIBRARY = {} - import copy import gc import logging -import math import multiprocessing import multiprocessing.dummy +import os import random import sys import warnings from configparser import ConfigParser -from itertools import chain +from pathlib import Path from tempfile import TemporaryDirectory -from sklearn.preprocessing import SplineTransformer +import numpy as np +import pandas as pd +import torch +from dask import compute, delayed +from psm_utils import PSM, Peptidoform, PSMList +from psm_utils.io import read_file +from psm_utils.io.peptide_record import peprec_to_proforma +from sklearn.base import BaseEstimator from sklearn.linear_model import LinearRegression from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import SplineTransformer +from torch.utils.data import DataLoader, Dataset +from deeplc._exceptions import CalibrationError +from deeplc.feat_extractor import aggregate_encodings, encode_peptidoform +from deeplc.trainl3 import train_elastic_net -# If CLI/GUI/frozen: disable Tensorflow info and warnings before importing +# If CLI/GUI/frozen: disable warnings before importing IS_CLI_GUI = os.path.basename(sys.argv[0]) in ["deeplc", "deeplc-gui"] IS_FROZEN = getattr(sys, "frozen", False) if IS_CLI_GUI or IS_FROZEN: - os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" - logging.getLogger("tensorflow").setLevel(logging.CRITICAL) - warnings.filterwarnings("ignore", category=DeprecationWarning) - warnings.filterwarnings("ignore", category=FutureWarning) warnings.filterwarnings("ignore", category=UserWarning) -# Supress warnings (or at least try...) -logging.getLogger("tensorflow").setLevel(logging.ERROR) -os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" - -import numpy as np -import pandas as pd -import tensorflow as tf -from deeplcretrainer import deeplcretrainer -from psm_utils.io import read_file -from psm_utils.io.peptide_record import peprec_to_proforma -from psm_utils.psm import PSM -from psm_utils.psm_list import PSMList +DEEPLC_DIR = os.path.dirname(os.path.realpath(__file__)) +DEFAULT_MODELS = [ + "mods/full_hc_PXD005573_pub_1fd8363d9af9dcad3be7553c39396960.pt", + "mods/full_hc_PXD005573_pub_8c22d89667368f2f02ad996469ba157e.pt", + "mods/full_hc_PXD005573_pub_cb975cfdd4105f97efa0b3afffe075cc.pt", +] +DEFAULT_MODELS = [os.path.join(DEEPLC_DIR, dm) for dm in DEFAULT_MODELS] -try: - from tensorflow.keras.models import load_model -except: - from tensorflow.python.keras.models import load_model -from tensorflow.python.eager import context +LIBRARY = {} -from deeplc._exceptions import CalibrationError -from deeplc.trainl3 import train_en +# TODO: Keep for torch? 
# Set to force CPU calculations os.environ["CUDA_VISIBLE_DEVICES"] = "-1" -# Feature extraction -from deeplc.feat_extractor import FeatExtractor +logger = logging.getLogger(__name__) -def warn(*args, **kwargs): - pass +def split_list(a, n): + k, m = divmod(len(a), n) + return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n)) -import warnings -warnings.warn = warn +def divide_chunks(list_, n_chunks): + """Yield successive n-sized chunks from list_.""" + for i in range(0, len(list_), n_chunks): + yield list_[i : i + n_chunks] -warnings.filterwarnings("ignore", category=DeprecationWarning) -warnings.filterwarnings("ignore", category=FutureWarning) -logger = logging.getLogger(__name__) +class DeepLCDataset(Dataset): + """ + Custom Dataset class for DeepLC used for loading features from peptide sequences. + Parameters + ---------- + X : ndarray + Feature matrix for input data. + X_sum : ndarray + Feature matrix for sum of input data. + X_global : ndarray + Feature matrix for global input data. + X_hc : ndarray + Feature matrix for high-order context features. + target : ndarray, optional + The target retention times. Default is None. + """ -def split_list(a, n): - k, m = divmod(len(a), n) - return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n)) + def __init__(self, X, X_sum, X_global, X_hc, target=None): + self.X = torch.from_numpy(X).float() + self.X_sum = torch.from_numpy(X_sum).float() + self.X_global = torch.from_numpy(X_global).float() + self.X_hc = torch.from_numpy(X_hc).float() + + if target is not None: + self.target = torch.from_numpy( + target + ).float() # Add target values if provided + else: + self.target = None # If no target is provided, set it to None + + def __len__(self): + return self.X.shape[0] + + def __getitem__(self, idx): + if self.target is not None: + # Return both features and target during training + return ( + self.X[idx], + self.X_sum[idx], + self.X_global[idx], + self.X_hc[idx], + self.target[idx], + ) + else: + # Return only features during prediction + return (self.X[idx], self.X_sum[idx], self.X_global[idx], self.X_hc[idx]) -def divide_chunks(l, n): - for i in range(0, len(l), n): - yield l[i : i + n] +class DeepLCFineTuner: + """ + Class for fine-tuning a DeepLC model. + + Parameters + ---------- + model : torch.nn.Module + The model to fine-tune. + train_data : torch.utils.data.Dataset + Dataset containing the training data. + device : str, optional, default='cpu' + The device on which to run the model ('cpu' or 'cuda'). + learning_rate : float, optional, default=0.001 + The learning rate for the optimizer. + epochs : int, optional, default=10 + Number of training epochs. + batch_size : int, optional, default=256 + Batch size for training. + validation_data : torch.utils.data.Dataset or None, optional + If provided, used directly for validation. Otherwise, a fraction of + `train_data` will be held out. + validation_split : float, optional, default=0.1 + Fraction of `train_data` to reserve for validation when + `validation_data` is None. + patience : int, optional, default=5 + Number of epochs with no improvement on validation loss before stopping. 
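+
+    Examples
+    --------
+    A minimal sketch of the intended fine-tuning flow; ``model`` is assumed to
+    be an already loaded DeepLC torch module and ``dataset`` a
+    :class:`DeepLCDataset` constructed with target retention times:
+
+    >>> tuner = DeepLCFineTuner(model, dataset, epochs=10, patience=5)
+    >>> fine_tuned = tuner.fine_tune()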
+ """ + def __init__( + self, + model, + train_data, + device="cpu", + learning_rate=0.001, + epochs=10, + batch_size=256, + validation_data=None, + validation_split=0.1, + patience=5, + ): + self.model = model.to(device) + self.train_data = train_data + self.device = device + self.learning_rate = learning_rate + self.epochs = epochs + self.batch_size = batch_size + self.validation_data = validation_data + self.validation_split = validation_split + self.patience = patience + + def _freeze_layers(self, unfreeze_keywords="33_1"): + """ + Freezes all layers except those that contain the unfreeze_keyword + in their name. + """ + + for name, param in self.model.named_parameters(): + param.requires_grad = unfreeze_keywords in name + + def prepare_data(self, data, shuffle=True): + return DataLoader(data, batch_size=self.batch_size, shuffle=shuffle) + + def fine_tune(self): + logger.debug("Starting fine-tuning...") + if self.validation_data is None: + # Split the training data into training and validation sets + val_size = int(len(self.train_data) * self.validation_split) + train_size = len(self.train_data) - val_size + train_dataset, val_dataset = torch.utils.data.random_split( + self.train_data, [train_size, val_size] + ) + else: + train_dataset = self.train_data + val_dataset = self.validation_data + train_loader = self.prepare_data(train_dataset) + val_loader = self.prepare_data(val_dataset, shuffle=False) + + optimizer = torch.optim.Adam( + filter(lambda p: p.requires_grad, self.model.parameters()), + lr=self.learning_rate, + ) + loss_fn = torch.nn.L1Loss() + best_model_wts = copy.deepcopy(self.model.state_dict()) + best_val_loss = float("inf") + epochs_no_improve = 0 + + for epoch in range(self.epochs): + running_loss = 0.0 + self.model.train() + for batch in train_loader: + batch_X, batch_X_sum, batch_X_global, batch_X_hc, target = batch + + target = target.view(-1, 1) + + optimizer.zero_grad() + outputs = self.model(batch_X, batch_X_sum, batch_X_global, batch_X_hc) + loss = loss_fn(outputs, target) + loss.backward() + optimizer.step() + running_loss += loss.item() + avg_loss = running_loss / len(train_loader) + + self.model.eval() + val_loss = 0.0 + with torch.no_grad(): + for batch in val_loader: + batch_X, batch_X_sum, batch_X_global, batch_X_hc, target = batch + target = target.view(-1, 1) + outputs = self.model( + batch_X, batch_X_sum, batch_X_global, batch_X_hc + ) + val_loss += loss_fn(outputs, target).item() + avg_val_loss = val_loss / len(val_loader) -def reset_keras(): - """Reset Keras session.""" - # sess = get_session() - # clear_session() - # sess.close() - # gc.collect() - # Set to force CPU calculations - os.environ["CUDA_VISIBLE_DEVICES"] = "-1" + logger.debug( + f"Epoch {epoch + 1}/{self.epochs}, " + f"Loss: {avg_loss:.4f}, " + f"Validation Loss: {avg_val_loss:.4f}" + ) + if avg_val_loss < best_val_loss: + best_val_loss = avg_val_loss + best_model_wts = copy.deepcopy(self.model.state_dict()) + epochs_no_improve = 0 + else: + epochs_no_improve += 1 + if epochs_no_improve >= self.patience: + logger.debug(f"Early stopping triggered {epoch + 1}") + break + self.model.load_state_dict(best_model_wts) + return self.model class DeepLC: """ DeepLC predictor. 
-    Parameters
-    ----------
-    main_path : str
-        main path of module
-    path_model : str, optional
-        path to prediction model(s); leave empty to select the best of the
-        default models based on the calibration peptides
-    verbose : bool, default=True
-        turn logging on/off
-    bin_dist : float, default=2
-        TODO
-    dict_cal_divider : int, default=50
-        sets precision for fast-lookup of retention times for calibration; e.g.
-        10 means a precision of 0.1 between the calibration anchor points
-    split_cal : int, default=50
-        number of splits in the chromatogram for piecewise linear calibration
-        fit
-    n_jobs : int, optional
-        number of CPU threads to use
-    config_file : str, optional
-        path to configuration file
-    f_extractor : object :: deeplc.FeatExtractor, optional
-        deeplc.FeatExtractor object to use
-    cnn_model : bool, default=True
-        use CNN model or not
-    batch_num : int, default=250000
-        prediction batch size (in peptides); lower to decrease memory footprint
-    write_library : bool, default=False
-        append new predictions to library for faster future results; requires
-        `use_library` option
-    use_library : str, optional
-        library file with previous predictions for faster results to read from,
-        or to write to
-    reload_library : bool, default=False
-        reload prediction library
 
     Methods
     -------
-    calibrate_preds(seqs=[], mods=[], identifiers=[], measured_tr=[], correction_factor=1.0, seq_df=None, use_median=True)
+    calibrate_preds
         Find best model and calibrate
-    make_preds(seqs=[], mods=[], identifiers=[], calibrate=True, seq_df=None, correction_factor=1.0, mod_name=None)
+    make_preds
         Make predictions
     """
 
     library = {}
 
-    # TODO have a CCS flag here
     def __init__(
         self,
-        main_path=os.path.dirname(os.path.realpath(__file__)),
-        path_model=None,
-        verbose=True,
-        bin_dist=2,
-        dict_cal_divider=50,
-        split_cal=50,
-        n_jobs=None,
-        config_file=None,
-        f_extractor=None,
-        cnn_model=True,
-        batch_num=250000,
-        batch_num_tf=1024,
-        write_library=False,
-        use_library=None,
-        reload_library=False,
-        pygam_calibration=True,
-        deepcallc_mod=False,
-        deeplc_retrain=False,
-        predict_ccs=False,
-        n_epochs=20,
-        single_model_mode=True,
+        main_path: str | None = None,
+        path_model: str | None = None,
+        verbose: bool = True,
+        bin_distance: int = 2,
+        dict_cal_divider: int = 50,
+        split_cal: int = 50,
+        n_jobs: int | None = None,
+        config_file: str | None = None,
+        f_extractor: None = None,
+        cnn_model: bool = True,
+        batch_num: int = int(1e6),
+        batch_num_tf: int = 1024,
+        write_library: bool = False,
+        use_library: str | None = None,
+        reload_library: bool = False,
+        pygam_calibration: bool = True,
+        deepcallc_mod: bool = False,
+        deeplc_retrain: bool = False,
+        predict_ccs: bool = False,
+        n_epochs: int = 20,
+        single_model_mode: bool = True,
     ):
+        """
+        Initialize the DeepLC predictor.
+
+        Parameters
+        ----------
+        main_path
+            Main path of the module.
+        path_model
+            Path to prediction model(s); if not provided, the best default model is selected based
+            on calibration peptides.
+        verbose
+            Turn logging on/off.
+        bin_distance
+            Number of decimals used when rounding retention times for the fast calibration
+            lookup.
+        dict_cal_divider
+            Divider that sets the precision for fast lookup of retention times in calibration;
+            e.g. 10 means a precision of 0.1 between the calibration anchor points.
+        split_cal
+            Number of splits in the chromatogram for piecewise linear calibration.
+        n_jobs
+            Number of CPU threads to use.
+        config_file
+            Path to a configuration file.
+        f_extractor
+            Deprecated.
+        cnn_model
+            Flag indicating whether to use the CNN model.
+        batch_num
+            Prediction batch size (in peptides); lower values reduce memory footprint.
+        batch_num_tf
+            Batch size for model inference.
+        write_library
+            Whether to append new predictions to a library for faster future access.
+        use_library
+            Library file to read from or write to for prediction caching.
+        reload_library
+            Whether to reload the prediction library.
+        pygam_calibration
+            Flag to enable calibration using PyGAM.
+        deepcallc_mod
+            Flag specific to deepcallc mode.
+        deeplc_retrain
+            Flag indicating whether to perform retraining (transfer learning) of prediction
+            models.
+        predict_ccs
+            Flag to control prediction of CCS values.
+        n_epochs
+            Number of epochs used in retraining if deeplc_retrain is enabled.
+        single_model_mode
+            Flag to use a single model instead of multiple default models.
+        """
         # if a config file is defined overwrite standard parameters
         if config_file:
             cparser = ConfigParser()
@@ -206,70 +367,40 @@ def __init__(
             split_cal = cparser.getint("DeepLC", "split_cal")
             n_jobs = cparser.getint("DeepLC", "n_jobs")
 
-        self.main_path = main_path
+        self.main_path = main_path or os.path.dirname(os.path.realpath(__file__))
+        self.path_model = self._get_model_paths(path_model, single_model_mode)
         self.verbose = verbose
-        self.bin_dist = bin_dist
-        self.calibrate_dict = {}
-        self.calibrate_min = float("inf")
-        self.calibrate_max = 0
-        self.n_epochs = n_epochs
+        self.bin_distance = bin_distance
+        self.dict_cal_divider = dict_cal_divider
+        self.split_cal = split_cal
+        self.n_jobs = multiprocessing.cpu_count() if n_jobs is None else n_jobs
+        self.config_file = config_file
         self.cnn_model = cnn_model
-
+        self.model_cache = {}
         self.batch_num = batch_num
         self.batch_num_tf = batch_num_tf
-        self.dict_cal_divider = dict_cal_divider
-        self.split_cal = split_cal
-        self.n_jobs = n_jobs
-
-        ################################################
-        #                                              #
-        #                !!!WARNING!!!                 #
-        #                                              #
-        #! DUE TO ISSUES WITH MULTITHREADING SET TO 1 !#
-        #                                              #
-        #                                              #
-        ################################################
-
-        self.n_jobs = 1
-
-        if self.n_jobs == None:
-            max_threads = multiprocessing.cpu_count()
-            self.n_jobs = max_threads
-
-        self.use_library = use_library
         self.write_library = write_library
-
+        self.use_library = use_library
         self.reload_library = reload_library
+        self.pygam_calibration = pygam_calibration
+        self.deepcallc_mod = deepcallc_mod
+        self.deeplc_retrain = deeplc_retrain
+        self.predict_ccs = predict_ccs
+        self.n_epochs = n_epochs
 
-        try:
-            tf.config.threading.set_intra_op_parallelism_threads(n_jobs)
-        except RuntimeError:
-            logger.warning(
-                "DeepLC tried to set intra op threads, but was unable to do so."
-            )
-
-        if "NUMEXPR_MAX_THREADS" not in os.environ:
-            os.environ["NUMEXPR_MAX_THREADS"] = str(n_jobs)
-
-        if path_model:
-            self.model = path_model
-        else:
-            if single_model_mode:
-                self.model = [DEFAULT_MODELS[0]]
-            else:
-                self.model = DEFAULT_MODELS
+        # Keep `model` as an alias of `path_model` for backwards compatibility
+        self.model = self.path_model
 
         if f_extractor:
-            self.f_extractor = f_extractor
-        else:
-            self.f_extractor = FeatExtractor()
-
-        self.pygam_calibration = pygam_calibration
-        self.deeplc_retrain = deeplc_retrain
+            warnings.warn(
+                "The f_extractor argument is deprecated and is ignored.",
+                DeprecationWarning,
+            )
 
-        self.deepcallc_mod = deepcallc_mod
-        self.predict_ccs = predict_ccs
+        # Calibration variables
+        self.calibrate_dict = {}
+        self.calibrate_min = float("inf")
+        self.calibrate_max = 0
 
         if self.deepcallc_mod:
             self.write_library = False
@@ -288,214 +419,108 @@ def __str__(self):
         |_|
         """
 
-    def do_f_extraction(self, seqs, mods, identifiers, charges=[]):
-        """
-        Extract all features we can extract; without parallelization; use if you
-        want to run feature extraction with a single core
+    @staticmethod
+    def _get_model_paths(
+        passed_model_path: str | None, single_model_mode: bool
+    ) -> list[str]:
+        """Get the model paths based on the passed model path and the single model mode."""
+        if passed_model_path:
+            return [passed_model_path]
 
-        Parameters
-        ----------
-        seqs : list
-            peptide sequence list; should correspond to mods and identifiers
-        mods : list
-            naming of the mods; should correspond to seqs and identifiers
-        identifiers : list
-            identifiers of the peptides; should correspond to seqs and mods
+        if single_model_mode:
+            return [DEFAULT_MODELS[0]]
 
-        Returns
-        -------
-        pd.DataFrame
-            feature matrix
-        """
-        list_of_psms = []
+        return DEFAULT_MODELS
 
-        if not self.predict_ccs:
-            for seq, mod, ident in zip(seqs, mods, identifiers):
-                list_of_psms.append(
-                    PSM(peptide=peprec_to_proforma(seq, mod), spectrum_id=ident)
-                )
-        else:
-            for seq, mod, ident, z in zip(seqs, mods, identifiers, charges):
-                list_of_psms.append(
-                    PSM(peptide=peprec_to_proforma(seq, mod, z), spectrum_id=ident)
-                )
-
-        psm_list = PSMList(psm_list=list_of_psms)
-
-        return self.f_extractor.full_feat_extract(
-            psm_list, predict_ccs=self.predict_ccs
-        )
-
-    def do_f_extraction_pd(self, df_instances, charges=[]):
+    def _prepare_feature_matrices(self, psm_list):
         """
-        Extract all features we can extract; without parallelization; use if
-        you want to run feature extraction with a single thread; and use a
-        defined dataframe
+        Extract features in parallel and assemble the four input matrices.
 
         Parameters
         ----------
-        df_instances : object :: pd.DataFrame
-            dataframe containing the sequences (column:seq), modifications
-            (column:modifications) and naming (column:index)
+        psm_list : list of PSM
+            List of peptide‐spectrum matches for which to extract features.
Returns ------- - pd.DataFrame - feature matrix + X : ndarray, shape (n_peptides, n_features) + X_sum : ndarray, shape (n_peptides, n_sum_features) + X_global : ndarray, shape (n_peptides, n_global_features * 2) + X_hc : ndarray, shape (n_peptides, n_hc_features) """ - - list_of_psms = [] - if len(charges) == 0: - for seq, mod, ident in zip( - df_instances["seq"], df_instances["modifications"], df_instances.index - ): - list_of_psms.append( - PSM(peptide=peprec_to_proforma(seq, mod), spectrum_id=ident) - ) - else: - for seq, mod, ident, z in zip( - df_instances["seq"], - df_instances["modifications"], - df_instances.index, - charges=df_instances["charges"], - ): - list_of_psms.append( - PSM( - peptide=peprec_to_proforma(seq, mod, charge=z), - spectrum_id=ident, - ) - ) - - psm_list = PSMList(psm_list=list_of_psms) - - return self.f_extractor.full_feat_extract( - psm_list, predict_ccs=self.predict_ccs + feats = self.do_f_extraction_psm_list_parallel(psm_list) + X = np.stack(list(feats["matrix"].values())) + X_sum = np.stack(list(feats["matrix_sum"].values())) + X_global = np.concatenate( + ( + np.stack(list(feats["matrix_all"].values())), + np.stack(list(feats["pos_matrix"].values())), + ), + axis=1, ) + X_hc = np.stack(list(feats["matrix_hc"].values())) + return X, X_sum, X_global, X_hc - def do_f_extraction_pd_parallel(self, df_instances): - """ - Extract all features we can extract; with parallelization; use if you - want to run feature extraction with multiple threads; and use a defined - dataframe - - Parameters - ---------- - df_instances : object :: pd.DataFrame - dataframe containing the sequences (column:seq), modifications - (column:modifications) and naming (column:index) - - Returns - ------- - pd.DataFrame - feature matrix - """ - # self.n_jobs = 1 - - df_instances_split = np.array_split(df_instances, math.ceil(self.n_jobs / 4.0)) - if multiprocessing.current_process().daemon: - logger.warning( - "DeepLC is running in a daemon process. Disabling multiprocessing as daemonic processes can't have children." 
- ) - pool = multiprocessing.dummy.Pool(1) - else: - pool = multiprocessing.Pool(math.ceil(self.n_jobs / 4.0)) + def _extract_features( + self, + peptidoforms: list[str | Peptidoform] | PSMList, + chunk_size: int = 10000, + ) -> dict[str, dict[int, np.ndarray]]: + """Extract features for all peptidoforms.""" + if isinstance(peptidoforms, PSMList): + peptidoforms = [psm.peptidoform for psm in peptidoforms] + + logger.debug("Running feature extraction in single-threaded mode...") + if self.n_jobs <= 1: + encodings = [ + encode_peptidoform(pf, predict_ccs=self.predict_ccs) + for pf in peptidoforms + ] - if self.n_jobs == 1: - df = self.do_f_extraction_pd(df_instances) else: - df = pd.concat(pool.map(self.do_f_extraction_pd, df_instances_split)) - pool.close() - pool.join() - return df - - def do_f_extraction_psm_list(self, psm_list): - """ - Extract all features we can extract; without parallelization; use if - you want to run feature extraction with a single thread; and use a - defined dataframe - - Parameters - ---------- - df_instances : object :: pd.DataFrame - dataframe containing the sequences (column:seq), modifications - (column:modifications) and naming (column:index) - - Returns - ------- - pd.DataFrame - feature matrix - """ - return self.f_extractor.full_feat_extract( - psm_list, predict_ccs=self.predict_ccs - ) - - def do_f_extraction_psm_list_parallel(self, psm_list): - """ - Extract all features we can extract; without parallelization; use if - you want to run feature extraction with a single thread; and use a - defined dataframe - - Parameters - ---------- - df_instances : object :: pd.DataFrame - dataframe containing the sequences (column:seq), modifications - (column:modifications) and naming (column:index) + logger.debug("Preparing feature extraction with Dask") + # Process peptidoforms in larger chunks to reduce task overhead. + peptidoform_strings = [ + str(pep) for pep in peptidoforms + ] # Faster pickling of strings + + def chunked_encode(chunk): + return [ + encode_peptidoform(pf, predict_ccs=self.predict_ccs) for pf in chunk + ] + + tasks = [ + delayed(chunked_encode)(peptidoform_strings[i : i + chunk_size]) + for i in range(0, len(peptidoform_strings), chunk_size) + ] - Returns - ------- - pd.DataFrame - feature matrix - """ - # TODO for multiproc I am still expecting a pd dataframe, this is not the case anymore, they are dicts - # self.n_jobs = 1 - logger.debug("prepare feature extraction") - if multiprocessing.current_process().daemon: - logger.warning( - "DeepLC is running in a daemon process. Disabling multiprocessing as daemonic processes can't have children." 
-            )
-            psm_list_split = split_list(psm_list, self.n_jobs)
-            pool = multiprocessing.dummy.Pool(1)
-        elif self.n_jobs > 1:
-            psm_list_split = split_list(psm_list, self.n_jobs)
-            pool = multiprocessing.Pool(self.n_jobs)
-
-        if self.n_jobs == 1:
-            logger.debug("start feature extraction")
-            all_feats = self.do_f_extraction_psm_list(psm_list)
-            logger.debug("got feature extraction results")
-        else:
-            logger.debug("start feature extraction")
-            all_feats_async = pool.map_async(
-                self.do_f_extraction_psm_list, psm_list_split
+            logger.debug("Starting feature extraction with Dask")
+            chunks_encodings = compute(
+                *tasks, scheduler="processes", num_workers=self.n_jobs
             )
-            logger.debug("wait for feature extraction")
-            all_feats_async.wait()
-            logger.debug("get feature extraction results")
-            res = all_feats_async.get()
-            matrix_names = res[0].keys()
-            all_feats = {
-                matrix_name: dict(
-                    enumerate(
-                        chain.from_iterable((v[matrix_name].values() for v in res))
-                    )
-                )
-                for matrix_name in matrix_names
-            }
+            # Flatten the list of lists.
+            encodings = [enc for chunk in chunks_encodings for enc in chunk]
 
-        # all_feats = pd.concat(all_feats_async.get())
+        # Aggregate the encodings into a single dictionary.
+        aggregated_encodings = aggregate_encodings(encodings)
 
-        logger.debug("got feature extraction results")
+        logger.debug("Finished feature extraction")
 
-        pool.close()
-        pool.join()
+        return aggregated_encodings
 
-        return all_feats
+    def _apply_calibration_core(
+        self,
+        uncal_preds: np.ndarray,
+        cal_dict: dict | list[BaseEstimator],
+        cal_min: float,
+        cal_max: float,
+    ) -> np.ndarray:
+        """Apply calibration to the predictions."""
+        if len(uncal_preds) == 0:
+            return np.array([])
 
-    def calibration_core(self, uncal_preds, cal_dict, cal_min, cal_max):
         cal_preds = []
-        if len(uncal_preds) == 0:
-            return np.array(cal_preds)
         if self.pygam_calibration:
             linear_model_left, spline_model, linear_model_right = cal_dict
             y_pred_spline = spline_model.predict(uncal_preds.reshape(-1, 1))
@@ -521,334 +546,247 @@ def calibrate_preds_func(
         else:
             for uncal_pred in uncal_preds:
                 try:
-                    slope, intercept = cal_dict[str(round(uncal_pred, self.bin_dist))]
+                    slope, intercept = cal_dict[
+                        str(round(uncal_pred, self.bin_distance))
+                    ]
                     cal_preds.append(slope * (uncal_pred) + intercept)
                 except KeyError:
                     # outside of the prediction range ...
use the last # calibration curve if uncal_pred <= cal_min: - slope, intercept = cal_dict[str(round(cal_min, self.bin_dist))] + slope, intercept = cal_dict[ + str(round(cal_min, self.bin_distance)) + ] cal_preds.append(slope * (uncal_pred) + intercept) elif uncal_pred >= cal_max: - slope, intercept = cal_dict[str(round(cal_max, self.bin_dist))] + slope, intercept = cal_dict[ + str(round(cal_max, self.bin_distance)) + ] cal_preds.append(slope * (uncal_pred) + intercept) else: - slope, intercept = cal_dict[str(round(cal_max, self.bin_dist))] + slope, intercept = cal_dict[ + str(round(cal_max, self.bin_distance)) + ] cal_preds.append(slope * (uncal_pred) + intercept) + return np.array(cal_preds) - def make_preds_core_library(self, psm_list=[], calibrate=True, mod_name=None): + def _make_preds_core_library(self, psm_list=None, calibrate=True, mod_name=None): + """Get predictions stored in library and calibrate them if needed.""" + psm_list = [] if psm_list is None else psm_list ret_preds = [] for psm in psm_list: ret_preds.append(LIBRARY[psm.peptidoform.proforma + "|" + mod_name]) if calibrate: - try: - ret_preds = self.calibration_core( - ret_preds, + if isinstance(self.calibrate_min, dict): + # if multiple models are used, use the model name to get the + # calibration values (DeepCallC mode) + calibrate_dict, calibrate_min, calibrate_max = ( self.calibrate_dict[mod_name], self.calibrate_min[mod_name], self.calibrate_max[mod_name], ) - except: - ret_preds = self.calibration_core( - ret_preds, + else: + # if only one model is used, use the same calibration values + calibrate_dict, calibrate_min, calibrate_max = ( self.calibrate_dict, self.calibrate_min, self.calibrate_max, ) + ret_preds = self._apply_calibration_core( + ret_preds, calibrate_dict, calibrate_min, calibrate_max + ) + return ret_preds - def make_preds_core( + def _make_preds_core( self, - X=[], - X_sum=[], - X_global=[], - X_hc=[], - psm_list=[], + X: np.ndarray | None = None, + X_sum: np.ndarray | None = None, + X_global: np.ndarray | None = None, + X_hc: np.ndarray | None = None, calibrate=True, mod_name=None, - ): - """ - Make predictions for sequences - Parameters - ---------- - seq_df : object :: pd.DataFrame - dataframe containing the sequences (column:seq), modifications - (column:modifications) and naming (column:index); will use parallel - by default! 
- seqs : list - peptide sequence list; should correspond to mods and identifiers - mods : list - naming of the mods; should correspond to seqs and identifiers - identifiers : list - identifiers of the peptides; should correspond to seqs and mods - calibrate : boolean - calibrate predictions or just return the predictions - correction_factor : float - correction factor to apply to predictions - mod_name : str or None - specify a model to use instead of the model assigned originally to - this instance of the object - Returns - ------- - np.array - predictions - """ + ) -> np.ndarray: + """Make predictions.""" + # Check calibration state if calibrate: - assert ( - self.calibrate_dict - ), "DeepLC instance is not yet calibrated.\ - Calibrate before making predictions, or use calibrate=False" - - if len(X) == 0 and len(psm_list) > 0: - if self.verbose: - logger.debug("Extracting features for the CNN model ...") - X = self.do_f_extraction_psm_list_parallel(psm_list) - - X_sum = np.stack(list(X["matrix_sum"].values())) - X_global = np.concatenate( - ( - np.stack(list(X["matrix_all"].values())), - np.stack(list(X["pos_matrix"].values())), - ), - axis=1, + assert self.calibrate_dict, ( + "DeepLC instance is not yet calibrated. Calibrate before making predictions, or " + "use `calibrate=False`" ) - X_hc = np.stack(list(X["matrix_hc"].values())) - X = np.stack(list(X["matrix"].values())) - elif len(X) == 0 and len(psm_list) == 0: - return [] + + if len(X) == 0: + return np.array([]) + + dataset = DeepLCDataset(X, X_sum, X_global, X_hc) + loader = DataLoader(dataset, batch_size=self.batch_num_tf, shuffle=False) ret_preds = [] - mod = load_model(mod_name) + mod = torch.load(mod_name, weights_only=False, map_location=torch.device("cpu")) + mod.eval() try: - X - ret_preds = mod.predict( - [X, X_sum, X_global, X_hc], - batch_size=self.batch_num_tf, - verbose=int(self.verbose), - ).flatten() + with torch.no_grad(): + for batch in loader: + batch_X, batch_X_sum, batch_X_global, batch_X_hc = batch + batch_preds = mod(batch_X, batch_X_sum, batch_X_global, batch_X_hc) + ret_preds.append(batch_preds.detach().cpu().numpy()) + + ret_preds = np.concatenate(ret_preds, axis=0) except UnboundLocalError: logger.debug("X is empty, skipping...") ret_preds = [] if calibrate: - try: - ret_preds = self.calibration_core( - ret_preds, + if isinstance(self.calibrate_min, dict): + # if multiple models are used, use the model name to get the + # calibration values (DeepCallC mode) + calibrate_dict, calibrate_min, calibrate_max = ( self.calibrate_dict[mod_name], self.calibrate_min[mod_name], self.calibrate_max[mod_name], ) - except: - ret_preds = self.calibration_core( - ret_preds, + else: + # if only one model is used, use the same calibration values + calibrate_dict, calibrate_min, calibrate_max = ( self.calibrate_dict, self.calibrate_min, self.calibrate_max, ) - # clear_session() + ret_preds = self._apply_calibration_core( + ret_preds, calibrate_dict, calibrate_min, calibrate_max + ) + gc.collect() return ret_preds def make_preds( - self, psm_list=None, infile="", calibrate=True, seq_df=None, mod_name=None + self, + psm_list: PSMList | None = None, + infile: str | Path | None = None, + seq_df: pd.DataFrame | None = None, + calibrate: bool = True, + mod_name: str | None = None, ): """ Make predictions for sequences, in batches if required. 
Parameters ---------- - seq_df : object :: pd.DataFrame - dataframe containing the sequences (column:seq), modifications - (column:modifications) and naming (column:index); will use parallel - by default! - seqs : list - peptide sequence list; should correspond to mods and identifiers - mods : list - naming of the mods; should correspond to seqs and identifiers - identifiers : list - identifiers of the peptides; should correspond to seqs and mods - calibrate : boolean - calibrate predictions or just return the predictions - correction_factor : float - correction factor to apply to predictions - mod_name : str or None - specify a model to use instead of the model assigned originally to - this instance of the object + psm_list + PSMList object containing the peptidoforms to predict for. + infile + Path to a file containing the peptidoforms to predict for. + seq_df + DataFrame containing the sequences (column:seq), modifications + (column:modifications) and naming (column:index). + calibrate + calibrate predictions or just return the predictions. + mod_name + specify a model to use instead of the model assigned originally to this instance of the + object. Returns ------- np.array predictions """ - if type(seq_df) == pd.core.frame.DataFrame: - list_of_psms = [] - if self.predict_ccs: - for seq, mod, ident, z in zip( - seq_df["seq"], - seq_df["modifications"], - seq_df.index, - seq_df["charge"], - ): - list_of_psms.append( - PSM( - peptidoform=peprec_to_proforma(seq, mod, charge=z), - spectrum_id=ident, - ) - ) + if psm_list is None: + if seq_df is not None: + psm_list = _dataframe_to_psm_list(seq_df) + elif infile is not None: + psm_list = _file_to_psm_list(infile) else: - for seq, mod, ident in zip( - seq_df["seq"], seq_df["modifications"], seq_df.index - ): - list_of_psms.append( - PSM(peptidoform=peprec_to_proforma(seq, mod), spectrum_id=ident) - ) - psm_list = PSMList(psm_list=list_of_psms) - - if len(infile) > 0: - psm_list = read_file(infile) - if "msms" in infile and ".txt" in infile: - mapper = pd.read_csv( - os.path.join( - os.path.dirname(os.path.realpath(__file__)), - "unimod/map_mq_file.csv", - ), - index_col=0, - )["value"].to_dict() - psm_list.rename_modifications(mapper) + raise ValueError( + "Either `psm_list` or `seq_df` or `infile` must be provided." 
+                )
+
+        if len(psm_list) == 0:
+            logger.warning("No PSMs to predict for.")
+            return []
 
         ret_preds_batches = []
         for psm_list_t in divide_chunks(psm_list, self.batch_num):
             ret_preds = []
-            if len(psm_list_t) > 0:
-                if self.verbose:
-                    logger.debug("Extracting features for the CNN model ...")
-
-                X = self.do_f_extraction_psm_list_parallel(psm_list_t)
-                X_sum = np.stack(list(X["matrix_sum"].values()))
-                X_global = np.concatenate(
-                    (
-                        np.stack(list(X["matrix_all"].values())),
-                        np.stack(list(X["pos_matrix"].values())),
-                    ),
-                    axis=1,
-                )
-                X_hc = np.stack(list(X["matrix_hc"].values()))
-                X = np.stack(list(X["matrix"].values()))
-            else:
-                return []
-            if isinstance(self.model, dict):
-                for m_group_name, m_name in self.model.items():
-                    ret_preds.append(
-                        self.make_preds_core(
-                            X=X,
-                            X_sum=X_sum,
-                            X_global=X_global,
-                            X_hc=X_hc,
-                            calibrate=calibrate,
-                            mod_name=m_name,
-                        )
-                    )
-                ret_preds = np.array([sum(a) / len(a) for a in zip(*ret_preds)])
-            elif mod_name is not None:
-                ret_preds = self.make_preds_core(
-                    X=X,
-                    X_sum=X_sum,
-                    X_global=X_global,
-                    X_hc=X_hc,
-                    calibrate=calibrate,
-                    mod_name=mod_name,
-                )
+            # Extract features for the CNN model
+            X, X_sum, X_global, X_hc = self._prepare_feature_matrices(psm_list_t)
+
+            # Check if model was provided, and if not, whether multiple models are selected in
+            # the DeepLC object or not.
+            if mod_name:
+                model_names = [mod_name]
+            elif isinstance(self.model, dict):
+                model_names = [m_name for m_group_name, m_name in self.model.items()]
             elif isinstance(self.model, list):
-                for m_name in self.model:
-                    ret_preds.append(
-                        self.make_preds_core(
-                            X=X,
-                            X_sum=X_sum,
-                            X_global=X_global,
-                            X_hc=X_hc,
-                            calibrate=calibrate,
-                            mod_name=m_name,
-                        )
-                    )
-                ret_preds = np.array([sum(a) / len(a) for a in zip(*ret_preds)])
+                model_names = self.model
+            elif isinstance(self.model, str):
+                model_names = [self.model]
+            else:
+                raise ValueError("Invalid model name provided.")
+
+            # Get predictions
+            if len(model_names) > 1:
+                # Iterate over models if multiple were selected
+                model_predictions = []
+                for model_name in model_names:
+                    model_predictions.append(
+                        self._make_preds_core(
+                            X=X,
+                            X_sum=X_sum,
+                            X_global=X_global,
+                            X_hc=X_hc,
+                            calibrate=calibrate,
+                            mod_name=model_name,
+                        )
+                    )
+                # Average the predictions from all models
+                ret_preds = np.mean(model_predictions, axis=0)
             else:
-                ret_preds = self.make_preds_core(
+                # Use the single model
+                ret_preds = self._make_preds_core(
                     X=X,
                     X_sum=X_sum,
                     X_global=X_global,
                     X_hc=X_hc,
                     calibrate=calibrate,
-                    mod_name=self.model,
+                    mod_name=model_names[0],
                 )
-            ret_preds_batches.extend(ret_preds)
-        return ret_preds_batches
-        # TODO make this multithreaded
-        # should be possible with the batched list
+            ret_preds_batches.append(ret_preds)
 
-    def calibrate_preds_func_pygam(
-        self,
-        psm_list=None,
-        correction_factor=1.0,
-        seq_df=None,
-        measured_tr=None,
-        use_median=True,
-        mod_name=None,
-    ):
-        if type(seq_df) == pd.core.frame.DataFrame:
-            list_of_psms = []
-            # TODO include charge here
-            if self.predict_ccs:
-                for seq, mod, ident, tr, z in zip(
-                    seq_df["seq"],
-                    seq_df["modifications"],
-                    seq_df.index,
-                    seq_df["tr"],
-                    seq_df["charge"],
-                ):
-                    list_of_psms.append(
-                        PSM(
-                            peptidoform=peprec_to_proforma(seq, mod, charge=z),
-                            spectrum_id=ident,
-                            retention_time=tr,
-                        )
-                    )
-            else:
-                for seq, mod, ident, tr in zip(
-                    seq_df["seq"], seq_df["modifications"], seq_df.index, seq_df["tr"]
-                ):
-                    list_of_psms.append(
-                        PSM(
peptidoform=peprec_to_proforma(seq, mod), - spectrum_id=ident, - retention_time=tr, - ) - ) - psm_list = PSMList(psm_list=list_of_psms) + all_ret_preds = np.concatenate(ret_preds_batches, axis=0) - measured_tr = [psm.retention_time for psm in psm_list] + return all_ret_preds - predicted_tr = self.make_preds(psm_list, calibrate=False, mod_name=mod_name) + def _calibrate_preds_pygam( + self, + measured_tr: np.ndarray, + predicted_tr: np.ndarray, + ) -> tuple[float, float, list[BaseEstimator]]: + """Make calibration curve for predictions using PyGAM.""" + logger.debug("Getting predictions for calibration...") # sort two lists, predicted and observed based on measured tr tr_sort = [ (mtr, ptr) for mtr, ptr in sorted( - zip(measured_tr, predicted_tr), key=lambda pair: pair[1] + zip(measured_tr, predicted_tr, strict=True), key=lambda pair: pair[1] ) ] measured_tr = np.array([mtr for mtr, ptr in tr_sort], dtype=np.float32) predicted_tr = np.array([ptr for mtr, ptr in tr_sort], dtype=np.float32) - # predicted_tr = list(predicted_tr) - # measured_tr = list(measured_tr) - # Fit a SplineTransformer model if self.deeplc_retrain: spline = SplineTransformer(degree=2, n_knots=10) @@ -889,94 +827,18 @@ def calibrate_preds_func_pygam( [linear_model_left, spline_model, linear_model_right], ) - def calibrate_preds_func( + def _calibrate_preds_piecewise_linear( self, - psm_list=None, - correction_factor=1.0, - seq_df=None, - use_median=True, - mod_name=None, - ): - """ - Make calibration curve for predictions - - Parameters - ---------- - seqs : list - peptide sequence list; should correspond to mods and identifiers - mods : list - naming of the mods; should correspond to seqs and identifiers - identifiers : list - identifiers of the peptides; should correspond to seqs and mods - measured_tr : list - measured tr of the peptides; should correspond to seqs, identifiers, - and mods - correction_factor : float - correction factor that needs to be applied to the supplied measured - trs - seq_df : object :: pd.DataFrame - a pd.DataFrame that contains the sequences, modifications and - observed retention times to fit a calibration curve - use_median : boolean - flag to indicate we need to use the median valuein a window to - perform calibration - mod_name - specify a model to use instead of the model assigned originally to - this instance of the object - - Returns - ------- - float - the minimum value where a calibration curve was fitted, lower values - will be extrapolated from the minimum fit of the calibration curve - float - the maximum value where a calibration curve was fitted, higher values - will be extrapolated from the maximum fit of the calibration curve - dict - dictionary with keys for rounded tr, and the values concern a linear - model that should be applied to do calibration (!!! what is the - shape of this?) 
- """ - if type(seq_df) == pd.core.frame.DataFrame: - list_of_psms = [] - # TODO include charge here - if self.predict_ccs: - for seq, mod, tr, ident, z in zip( - seq_df["seq"], - seq_df["modifications"], - seq_df["tr"], - seq_df.index, - seq_df["charge"], - ): - list_of_psms.append( - PSM( - peptidoform=peprec_to_proforma(seq, mod, charge=z), - spectrum_id=ident, - retention_time=tr, - ) - ) - else: - for seq, mod, tr, ident in zip( - seq_df["seq"], seq_df["modifications"], seq_df["tr"], seq_df.index - ): - list_of_psms.append( - PSM( - peptidoform=peprec_to_proforma(seq, mod), - spectrum_id=ident, - retention_time=tr, - ) - ) - psm_list = PSMList(psm_list=list_of_psms) - - measured_tr = [psm.retention_time for psm in psm_list] - - predicted_tr = self.make_preds(psm_list, calibrate=False, mod_name=mod_name) - + measured_tr: np.ndarray, + predicted_tr: np.ndarray, + use_median: bool = True, + ) -> tuple[float, float, dict[str, tuple[float]]]: + """Make calibration curve for predictions.""" # sort two lists, predicted and observed based on measured tr tr_sort = [ (mtr, ptr) for mtr, ptr in sorted( - zip(measured_tr, predicted_tr), key=lambda pair: pair[1] + zip(measured_tr, predicted_tr, strict=False), key=lambda pair: pair[1] ) ] measured_tr = np.array([mtr for mtr, ptr in tr_sort]) @@ -989,10 +851,9 @@ def calibrate_preds_func( calibrate_min = float("inf") calibrate_max = 0 - if self.verbose: - logger.debug( - "Selecting the data points for calibration (used to fit the linear models between)" - ) + logger.debug( + "Selecting the data points for calibration (used to fit the linear models between)" + ) # smooth between observed and predicted split_val = predicted_tr[-1] / self.split_cal @@ -1021,16 +882,13 @@ def calibrate_preds_func( mtr_mean.append(sum(mtr) / len(mtr)) ptr_mean.append(sum(ptr) / len(ptr)) - if self.verbose: - logger.debug("Fitting the linear models between the points") + logger.debug("Fitting the linear models between the points") if self.split_cal >= len(measured_tr): raise CalibrationError( - "Not enough measured tr ({}) for the chosen number of splits ({}). " - "Choose a smaller split_cal parameter or provide more peptides for " - "fitting the calibration curve.".format( - len(measured_tr), self.split_cal - ) + f"Not enough measured tr ({len(measured_tr)}) for the chosen number of splits " + f"({self.split_cal}). Choose a smaller split_cal parameter or provide more " + "peptides for fitting the calibration curve." 
                )
 
         if len(mtr_mean) == 0:
             raise CalibrationError(
@@ -1054,112 +912,86 @@ def calibrate_preds_func(
             # optimized predictions using a dict to find calibration curve very
             # fast
             for v in np.arange(
-                round(ptr_mean[i], self.bin_dist),
-                round(ptr_mean[i + 1], self.bin_dist),
-                1 / ((self.bin_dist) * self.dict_cal_divider),
+                round(ptr_mean[i], self.bin_distance),
+                round(ptr_mean[i + 1], self.bin_distance),
+                1 / (self.bin_distance * self.dict_cal_divider),
             ):
                 if v < calibrate_min:
                     calibrate_min = v
                 if v > calibrate_max:
                     calibrate_max = v
-                calibrate_dict[str(round(v, self.bin_dist))] = [slope, intercept]
+                calibrate_dict[str(round(v, self.bin_distance))] = (slope, intercept)
 
         return calibrate_min, calibrate_max, calibrate_dict
 
     def calibrate_preds(
         self,
-        psm_list=None,
-        infile="",
-        measured_tr=[],
-        correction_factor=1.0,
-        location_retraining_models="",
-        psm_utils_obj=None,
-        sample_for_calibration_curve=None,
-        seq_df=None,
-        use_median=True,
+        psm_list: PSMList | None = None,
+        infile: str | Path | None = None,
+        seq_df: pd.DataFrame | None = None,
+        measured_tr: np.ndarray | None = None,
+        location_retraining_models: str = "",
+        sample_for_calibration_curve: int | None = None,
+        use_median: bool = True,
         return_plotly_report=False,
-    ):
+    ) -> dict | None:
         """
         Find best model and calibrate.
 
         Parameters
         ----------
-        seqs : list
-            peptide sequence list; should correspond to mods and identifiers
-        mods : list
-            naming of the mods; should correspond to seqs and identifiers
-        identifiers : list
-            identifiers of the peptides; should correspond to seqs and mods
-        measured_tr : list
-            measured tr of the peptides; should correspond to seqs, identifiers,
-            and mods
-        correction_factor : float
-            correction factor that needs to be applied to the supplied measured
-            trs
-        seq_df : object :: pd.DataFrame
-            a pd.DataFrame that contains the sequences, modifications and
-            observed retention times to fit a calibration curve
-        use_median : boolean
-            flag to indicate we need to use the median valuein a window to
-            perform calibration
+        psm_list
+            PSMList object containing the peptidoforms to predict for.
+        infile
+            Path to a file containing the peptidoforms to predict for.
+        seq_df
+            DataFrame containing the sequences (column:seq), modifications (column:modifications),
+            naming (column:index), and optionally charge (column:charge) and measured retention
+            time (column:tr).
+        measured_tr
+            Measured retention times used for calibration. Should correspond to the provided
+            PSMs. If None, the measured retention times are taken from the PSMs.
+        location_retraining_models
+            Location to save the retraining models; if empty, a temporary directory is used.
+        sample_for_calibration_curve
+            Number of PSMs to sample for the calibration curve; if None, all provided PSMs are
+            used.
+        use_median
+            Whether to use the median value in a window to perform calibration; only applies to
+            piecewise linear calibration, not to PyGAM calibration.
+        return_plotly_report
+            Whether to return a plotly report with the calibration results.
 
         Returns
         -------
+        dict | None
+            Dictionary with plotly report information or None.
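+
+        Examples
+        --------
+        A minimal sketch, assuming ``psm_list_cal`` is a PSMList whose PSMs
+        carry observed retention times:
+
+        >>> dlc = DeepLC()
+        >>> dlc.calibrate_preds(psm_list=psm_list_cal)
+        >>> preds = dlc.make_preds(psm_list=psm_list_cal)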
""" - if type(seq_df) == pd.core.frame.DataFrame: - list_of_psms = [] - if self.predict_ccs: - for seq, mod, ident, tr, z in zip( - seq_df["seq"], - seq_df["modifications"], - seq_df.index, - seq_df["tr"], - seq_df["charge"], - ): - list_of_psms.append( - PSM( - peptidoform=peprec_to_proforma(seq, mod, charge=z), - spectrum_id=ident, - retention_time=tr, - ) - ) + # Getting PSMs either from psm_list, seq_df, or infile + if psm_list is None: + if seq_df is not None: + psm_list = _dataframe_to_psm_list(seq_df) + elif infile is not None: + psm_list = _file_to_psm_list(infile) else: - for seq, mod, ident, tr in zip( - seq_df["seq"], seq_df["modifications"], seq_df.index, seq_df["tr"] - ): - list_of_psms.append( - PSM( - peptidoform=peprec_to_proforma(seq, mod), - spectrum_id=ident, - retention_time=tr, - ) - ) - psm_list = PSMList(psm_list=list_of_psms) - elif psm_utils_obj: - psm_list = psm_utils_obj + raise ValueError( + "Either `psm_list` or `seq_df` or `infile` must be provided." + ) + + # Getting measured retention time either from measured_tr or provided PSMs + if not measured_tr: + measured_tr = [psm.retention_time for psm in psm_list] + if None in measured_tr: + raise ValueError("Not all PSMs have an observed retention time.") + # Ensuring self.model is list of strings if isinstance(self.model, str): self.model = [self.model] - if len(infile) > 0: - psm_list = read_file(infile) - if "msms" in infile and ".txt" in infile: - mapper = pd.read_csv( - os.path.join( - os.path.dirname(os.path.realpath(__file__)), - "unimod/map_mq_file.csv", - ), - index_col=0, - )["value"].to_dict() - psm_list.rename_modifications(mapper) - - measured_tr = [psm.retention_time for psm in psm_list] - - if self.verbose: - logger.debug("Start to calibrate predictions ...") - if self.verbose: - logger.debug("Ready to find the best model out of: %s" % (self.model)) + logger.debug("Start to calibrate predictions ...") + logger.debug(f"Ready to find the best model out of: {self.model}") best_perf = float("inf") best_calibrate_min = 0.0 @@ -1174,81 +1006,86 @@ def calibrate_preds( temp_pred = [] if self.deeplc_retrain: - # The following code is not required in most cases, but here it is used to clear variables that might cause problems - _ = tf.Variable([1]) + logger.debug("Preparing for model fine-tuning...") - context._context = None - context._create_context() + X, X_sum, X_global, X_hc = self._prepare_feature_matrices(psm_list) + dataset = DeepLCDataset(X, X_sum, X_global, X_hc, np.array(measured_tr)) - tf.config.threading.set_inter_op_parallelism_threads(1) + base_model_path = ( + self.model[0] if isinstance(self.model, list) else self.model + ) + base_model = torch.load( + base_model_path, weights_only=False, map_location=torch.device("cpu") + ) + base_model.eval() - if len(location_retraining_models) > 0: - t_dir_models = TemporaryDirectory().name - os.mkdir(t_dir_models) + fine_tuner = DeepLCFineTuner( + model=base_model, + train_data=dataset, + batch_size=self.batch_num_tf, + epochs=self.n_epochs, + ) + # fine_tuner._freeze_layers() + fine_tuned_model = fine_tuner.fine_tune() + + if location_retraining_models: + os.makedirs(location_retraining_models, exist_ok=True) + temp_dir_obj = TemporaryDirectory() + t_dir_models = temp_dir_obj.name + self._temp_dir_obj = temp_dir_obj else: t_dir_models = location_retraining_models - try: - os.mkdir(t_dir_models) - except: - pass - - # Here we will apply transfer learning we specify previously trained models in the 'mods_transfer_learning' - models = 
deeplcretrainer.retrain( - {"deeplc_transferlearn": psm_list}, - outpath=t_dir_models, - mods_transfer_learning=self.model, - freeze_layers=True, - n_epochs=self.n_epochs, - freeze_after_concat=1, - verbose=self.verbose, - ) + os.makedirs(t_dir_models, exist_ok=True) - self.model = models + # Define path to save fine-tuned model + fine_tuned_model_path = os.path.join(t_dir_models, "fine_tuned_model.pth") + torch.save(fine_tuned_model, fine_tuned_model_path) + self.model = [fine_tuned_model_path] - if isinstance(sample_for_calibration_curve, int): + # Limit calibration to a subset of PSMs if specified + if sample_for_calibration_curve: psm_list = random.sample(list(psm_list), sample_for_calibration_curve) measured_tr = [psm.retention_time for psm in psm_list] - for m in self.model: - if self.verbose: - logger.debug("Trying out the following model: %s" % (m)) + for model_name in self.model: + logger.debug(f"Trying out the following model: {model_name}") + predicted_tr = self.make_preds( + psm_list, calibrate=False, mod_name=model_name + ) + if self.pygam_calibration: - calibrate_output = self.calibrate_preds_func_pygam( - psm_list, - measured_tr=measured_tr, - correction_factor=correction_factor, - seq_df=seq_df, - use_median=use_median, - mod_name=m, + calibrate_output = self._calibrate_preds_pygam( + measured_tr, predicted_tr ) else: - calibrate_output = self.calibrate_preds_func( - psm_list, - correction_factor=correction_factor, - seq_df=seq_df, - use_median=use_median, - mod_name=m, + calibrate_output = self._calibrate_preds_piecewise_linear( + measured_tr, predicted_tr, use_median=use_median ) + self.calibrate_min, self.calibrate_max, self.calibrate_dict = ( + calibrate_output + ) + # TODO: Currently, calibration dict can be both a dict (linear) or a list of models + # (PyGAM)... This should be handled better in the future. + + # Skip this model if calibrate_dict is empty + # TODO: Should this do something when using PyGAM and calibrate_dict is a list? 
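+            # For the piecewise-linear case, calibrate_dict maps rounded predicted
+            # values to (slope, intercept) pairs; an empty dict indicates that no
+            # calibration points could be placed for this model.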
+ if ( + isinstance(self.calibrate_dict, dict) + and len(self.calibrate_dict.keys()) == 0 + ): + continue - ( - self.calibrate_min, - self.calibrate_max, - self.calibrate_dict, - ) = calibrate_output - - if type(self.calibrate_dict) == dict: - if len(self.calibrate_dict.keys()) == 0: - continue - - m_name = m.split("/")[-1] - - preds = self.make_preds(psm_list, calibrate=True, seq_df=seq_df, mod_name=m) + m_name = model_name.split("/")[-1] - if self.deepcallc_mod: - m_group_name = "deepcallc" - else: - m_group_name = "_".join(m_name.split("_")[:-1]) + # Get new predictions with calibration + preds = self.make_preds( + psm_list, calibrate=True, seq_df=seq_df, mod_name=model_name + ) + m_group_name = ( + "deepcallc" if self.deepcallc_mod else "_".join(m_name.split("_")[:-1]) + ) + m = model_name try: pred_dict[m_group_name][m] = preds mod_dict[m_group_name][m] = m @@ -1268,26 +1105,24 @@ def calibrate_preds( mod_calibrate_min_dict[m_group_name][m] = self.calibrate_min mod_calibrate_max_dict[m_group_name][m] = self.calibrate_max - for m_name in pred_dict.keys(): - preds = [sum(a) / len(a) for a in zip(*list(pred_dict[m_name].values()))] + for m_name in pred_dict: + preds = [ + sum(a) / len(a) + for a in zip(*list(pred_dict[m_name].values()), strict=True) + ] if len(measured_tr) == 0: perf = sum(abs(seq_df["tr"] - preds)) else: perf = sum(abs(np.array(measured_tr) - np.array(preds))) - if self.verbose: - logger.debug( - "For %s model got a performance of: %s" - % (m_name, perf / len(preds)) - ) + logger.debug( + f"For {m_name} model got a performance of: {perf / len(preds)}" + ) if perf < best_perf: - if self.deepcallc_mod: - m_group_name = "deepcallc" - else: - m_group_name = m_name - # TODO is deepcopy really required? + m_group_name = "deepcallc" if self.deepcallc_mod else m_name + # TODO is deepcopy really required? best_calibrate_dict = copy.deepcopy(mod_calibrate_dict[m_group_name]) best_calibrate_min = copy.deepcopy(mod_calibrate_min_dict[m_group_name]) best_calibrate_max = copy.deepcopy(mod_calibrate_max_dict[m_group_name]) @@ -1304,21 +1139,22 @@ def calibrate_preds( self.model = best_model if self.deepcallc_mod: - self.deepcallc_model = train_en( + self.deepcallc_model = train_elastic_net( pd.DataFrame(pred_dict["deepcallc"]), seq_df["tr"] ) - # self.n_jobs = 1 - - logger.debug("Model with the best performance got selected: %s" % (best_model)) + logger.debug(f"Model with the best performance got selected: {best_model}") if return_plotly_report: import deeplc.plot plotly_return_dict = {} plotly_df = pd.DataFrame( - list(zip(temp_obs, temp_pred)), - columns=["Observed retention time", "Predicted retention time"], + list(zip(temp_obs, temp_pred, strict=True)), + columns=[ + "Observed retention time", + "Predicted retention time", + ], ) plotly_return_dict["scatter"] = deeplc.plot.scatter(plotly_df) plotly_return_dict["baseline_dist"] = deeplc.plot.distribution_baseline( @@ -1326,26 +1162,97 @@ def calibrate_preds( ) return plotly_return_dict - return {} + return None - def split_seq(self, a, n): - """ - Split a list (a) into multiple chunks (n) - - Parameters - ---------- - a : list - list to split - n : list - number of chunks - Returns - ------- - list - chunked list - """ +def _get_pool(n_jobs: int) -> multiprocessing.Pool | multiprocessing.dummy.Pool: # type: ignore + """Get a Pool object for parallel processing.""" + if multiprocessing.current_process().daemon: + logger.warning( + "DeepLC is running in a daemon process. 
Disabling multiprocessing as daemonic "
+            "processes can't have children."
+        )
+        return multiprocessing.dummy.Pool(1)
+    elif n_jobs == 1:
+        return multiprocessing.dummy.Pool(1)
+    else:
+        max_n_jobs = multiprocessing.cpu_count()
+        if n_jobs > max_n_jobs:
+            logger.warning(
+                f"Number of jobs ({n_jobs}) exceeds the number of CPUs ({max_n_jobs}). "
+                f"Setting number of jobs to {max_n_jobs}."
+            )
+            return multiprocessing.Pool(max_n_jobs)
+        else:
+            return multiprocessing.Pool(n_jobs)
+
+
+def _lists_to_psm_list(
+    sequences: list[str],
+    modifications: list[str | None],
+    identifiers: list[str],
+    charges: list[int] | None,
+    retention_times: list[float] | None = None,
+    n_jobs: int = 1,
+) -> PSMList:
+    """Convert lists into a PSMList using Dask for parallel processing."""
+    if charges is None:
+        charges = [None] * len(sequences)
+
+    if retention_times is None:
+        retention_times = [None] * len(sequences)
+
+    def create_psm(args):
+        sequence, modifications, identifier, charge, retention_time = args
+        return PSM(
+            peptidoform=peprec_to_proforma(sequence, modifications, charge=charge),
+            spectrum_id=identifier,
+            retention_time=retention_time,
+        )

+    args_list = list(
+        zip(
+            sequences, modifications, identifiers, charges, retention_times, strict=True
+        )
+    )
+    tasks = [delayed(create_psm)(args) for args in args_list]
+    list_of_psms = list(compute(*tasks, scheduler="processes", num_workers=n_jobs))
+    return PSMList(psm_list=list_of_psms)
+
+
+# TODO: I'm not sure what the expected behavior was for charges; they were parsed
+# from the dataframe, while the passed list was used to check whether they should be
+# parsed. I'll allow both with a priority for the passed charges.
+def _dataframe_to_psm_list(
+    dataframe: pd.DataFrame,
+    charges: list[int] | None = None,
+    n_jobs: int = 1,
+) -> PSMList:
+    """Convert a DataFrame with sequences, modifications, and identifiers into a PSMList."""
+    sequences = dataframe["seq"]
+    modifications = dataframe["modifications"]
+    identifiers = dataframe.index
+    retention_times = dataframe["tr"].tolist() if "tr" in dataframe.columns else None
+
+    if not charges and "charge" in dataframe.columns:
+        charges = dataframe["charge"].tolist()
+
+    return _lists_to_psm_list(
+        sequences, modifications, identifiers, charges, retention_times, n_jobs=n_jobs
+    )
+
+
+def _file_to_psm_list(input_file: str | Path) -> PSMList:
+    """Read a file into a PSMList, optionally mapping MaxQuant modification labels."""
+    psm_list = read_file(input_file)
+    if "msms" in str(input_file) and ".txt" in str(input_file):
+        mapper = pd.read_csv(
+            os.path.join(
+                os.path.dirname(os.path.realpath(__file__)),
+                "unimod/map_mq_file.csv",
+            ),
+            index_col=0,
+        )["value"].to_dict()
+        psm_list.rename_modifications(mapper)
+
+    return psm_list
diff --git a/deeplc/feat_extractor.py b/deeplc/feat_extractor.py
index 1b12262..ecda640 100644
--- a/deeplc/feat_extractor.py
+++ b/deeplc/feat_extractor.py
@@ -1,659 +1,233 @@
-"""
-This code is used to extract features for peptides used in DeepLC (or seperately
-if required).
+"""Feature extraction for DeepLC."""
-For the library versions see the .yml file
-"""
+from __future__ import annotations
-__author__ = ["Robbin Bouwmeester", "Ralf Gabriels"]
-__credits__ = [
-    "Robbin Bouwmeester",
-    "Ralf Gabriels",
-    "Arthur Declercq",
-    "Prof. 
Lennart Martens", - "Sven Degroeve", -] -__license__ = "Apache License, Version 2.0" -__maintainer__ = ["Robbin Bouwmeester", "Ralf Gabriels"] -__email__ = ["Robbin.Bouwmeester@ugent.be", "Ralf.Gabriels@ugent.be"] - -# Native imports -import os -import math -import time -from configparser import ConfigParser -import ast -from re import sub import logging +import warnings +from re import sub import numpy as np -import pandas as pd -from psm_utils.io.peptide_record import peprec_to_proforma -from psm_utils.psm import PSM -from psm_utils.psm_list import PSMList +from psm_utils.psm import Peptidoform from pyteomics import mass logger = logging.getLogger(__name__) -class FeatExtractor: - """ - Place holder, fill later - - Parameters - ---------- - - Returns - ------- - - """ - - def __init__( - self, - main_path=os.path.dirname(os.path.realpath(__file__)), - lib_path_mod=os.path.join( - os.path.dirname(os.path.realpath(__file__)), "unimod/" - ), - lib_aa_composition=os.path.join( - os.path.dirname(os.path.realpath(__file__)), "aa_comp_rel.csv" - ), - split_size=7, - verbose=True, - include_specific_posses=[0, 1, 2, 3, 4, 5, 6, -1, -2, -3, -4, -5, -6, -7], - add_sum_feat=False, - ptm_add_feat=False, - ptm_subtract_feat=False, - add_rolling_feat=False, - include_unnormalized=True, - add_comp_feat=False, - cnn_feats=True, - ignore_mods=False, - config_file=None, - ): - # if a config file is defined overwrite standard parameters - if config_file: - cparser = ConfigParser() - cparser.read(config_file) - lib_path_mod = cparser.get("featExtractor", "lib_path_mod").strip('"') - split_size = cparser.getint("featExtractor", "split_size") - verbose = cparser.getboolean("featExtractor", "verbose") - add_sum_feat = cparser.getboolean("featExtractor", "add_sum_feat") - ptm_add_feat = cparser.getboolean("featExtractor", "ptm_add_feat") - ptm_subtract_feat = cparser.getboolean("featExtractor", "ptm_subtract_feat") - add_rolling_feat = cparser.getboolean("featExtractor", "add_rolling_feat") - include_unnormalized = cparser.getboolean( - "featExtractor", "include_unnormalized" - ) - include_specific_posses = ast.literal_eval( - cparser.get("featExtractor", "include_specific_posses") - ) - - self.main_path = main_path - - # Get the atomic composition of AAs - self.lib_aa_composition = self.get_aa_composition(lib_aa_composition) - - self.split_size = split_size - self.verbose = verbose - - self.add_sum_feat = add_sum_feat - self.ptm_add_feat = ptm_add_feat - self.ptm_subtract_feat = ptm_subtract_feat - self.add_rolling_feat = add_rolling_feat - self.cnn_feats = cnn_feats - self.include_unnormalized = include_unnormalized - self.include_specific_posses = include_specific_posses - self.add_comp_feat = add_comp_feat - self.ignore_mods = ignore_mods - - def __str__(self): - return """ - _____ _ _____ __ _ _ _ - | __ \ | | / ____| / _| | | | | | | - | | | | ___ ___ _ __ | | | | ______| |_ ___ __ _| |_ _____ _| |_ _ __ __ _ ___| |_ ___ _ __ - | | | |/ _ \/ _ \ '_ \| | | | |______| _/ _ \/ _` | __| / _ \ \/ / __| '__/ _` |/ __| __/ _ \| '__| - | |__| | __/ __/ |_) | |___| |____ | || __/ (_| | |_ | __/> <| |_| | | (_| | (__| || (_) | | - |_____/ \___|\___| .__/|______\_____| |_| \___|\__,_|\__| \___/_/\_\\__|_| \__,_|\___|\__\___/|_| - | | - |_| - """ - - def get_aa_composition(self, file_loc): - """ - Read amino acid atomic composition and return a dictionary - - Parameters - ---------- - file_loc : str - location of the (csv) file that contains the atomic compositions of AAs. 
The first column must contain - the one-letter AA code. The remaining columns contain counts for each atom (each atom in seperate - column). An example is: - - aa,C,H,N,O,S - A,1,2,0,0,0 - R,4,9,3,0,0 - N,2,3,1,1,0 - - Returns - ------- - dict - dictionary that goes from one-letter AA to atomic composition - """ - return pd.read_csv(file_loc, index_col=0).T.to_dict() - - def split_seq(self, a, n): - """ - Split a list (a) into multiple chunks (n) - - Parameters - ---------- - a : list - list to split - n : list - number of chunks - - Returns - ------- - list - chunked list - """ - # since chunking is not alway possible do the modulo of residues - k, m = divmod(len(a), n) - return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n)) - - def calc_feats_mods(self, formula): - """ - Chemical formula to atom addition/subtraction - - Parameters - ---------- - formula : str - chemical formula - - Returns - ------- - list - atom naming - list - number of atom added/subtracted - """ - if not formula: - return [], [] - if len(str(formula)) == 0: - return [], [] - if not isinstance(formula, str): - if math.isnan(formula): - return [], [] - - new_atoms = [] - new_num_atoms = [] - for atom in formula.split(" "): - if "(" not in atom: - atom_symbol = atom - num_atom = 1 - else: - atom_symbol = atom.split("(")[0] - num_atom = atom.split("(")[1].rstrip(")") - new_atoms.append(atom_symbol) - new_num_atoms.append(int(num_atom)) - return new_atoms, new_num_atoms - - def get_feats_mods( - self, - seqs, - mods, - identifiers, - split_size=False, - atoms_order=set(["H", "C", "N", "O", "P", "S"]), - add_str="_sum", - subtract_mods=False, - ): - """ - Chemical formula to atom addition/subtraction - - Parameters - ---------- - seqs : list - peptide sequence list; should correspond to mods and identifiers - mods : list - naming of the mods; should correspond to seqs and identifiers - identifiers : str - identifiers of the peptides; should correspond to seqs and mods - split_size : int - overwrite the set split size if needed - atoms_order : set - atoms to include and the order - add_str : str - add this substring to feature naming - subtract_mods : boolean - calculate the atom that are substracted in the PTM - - Returns - ------- - object :: pd.DataFrame - feature matrix for peptide PTMs - """ - if not split_size: - split_size = self.split_size - if self.verbose: - t0 = time.time() - mod_dict = {} - look_up_mod_subtract = {} - look_up_mod_add = {} - - len_init = len( - [ao + str(spl_s) for spl_s in range(split_size) for ao in atoms_order] - ) - for index_name, mod, seq in zip(identifiers, mods, seqs): - mod_dict[index_name] = dict( - zip( - [ - ao + str(spl_s) + add_str - for spl_s in range(split_size) - for ao in atoms_order - ], - [0] * len_init, - ) - ) - - if not mod: - continue - if len(str(mod)) == 0: - continue - if not isinstance(mod, str): - if math.isnan(mod): - continue - - if self.verbose: - logger.debug( - "Time to calculate mod features: %s seconds" % (time.time() - t0) - ) - df_ret = pd.DataFrame(mod_dict, dtype=int).T - del mod_dict - return df_ret - - def encode_atoms( - self, - psm_list, - indexes, - charges=[], - predict_ccs=False, - padding_length=60, - positions=set([0, 1, 2, 3, -1, -2, -3, -4]), - positions_pos=set([0, 1, 2, 3]), - positions_neg=set([-1, -2, -3, -4]), - sum_mods=2, - dict_aa={ - "K": 0, - "R": 1, - "P": 2, - "T": 3, - "N": 4, - "A": 5, - "Q": 6, - "V": 7, - "S": 8, - "G": 9, - "I": 10, - "L": 11, - "C": 12, - "M": 13, - "H": 14, - "F": 15, - "Y": 16, - "W": 
17, - "E": 18, - "D": 19, - }, - dict_index_pos={"C": 0, "H": 1, "N": 2, "O": 3, "S": 4, "P": 5}, - dict_index_all={"C": 0, "H": 1, "N": 2, "O": 3, "S": 4, "P": 5}, - dict_index={"C": 0, "H": 1, "N": 2, "O": 3, "S": 4, "P": 5}, - ): - """ - Extract all features we can extract... Probably the function you want to call by default - - Parameters - ---------- - seqs : list - peptide sequence list; should correspond to mods and identifiers - mods_all : list - naming of the mods; should correspond to seqs and identifiers - indexes : list - identifiers of the peptides; should correspond to seqs and mods - padding_length : int - indicates padding length with 'X'-characters. Shorter sequences are padded. Longer sequences - are sliced shorter (C-terminal > than padding length will be missing) - positions : list - list of positions to include separately, for the C-terminus - provide negative indices - sum_mods : int - value that is used to feed the second head of cerberus with summed information, for example, - a value of 2 would sum the composition of each 2 AAs (i.e. 1+2, 3+4, 5+6 ...) - dict_index_pos : dict - index position of atom for positional features - dict_index_all : dict - index position of atom for overall compositional features - dict_index : dict - index position of atom for compositional features for the whole peptide (each position) - charges : list - optional list with charges, keep empty if these will not effect the predicted value - - Returns - ------- - object :: pd.DataFrame - feature matrix (np.matrix) of all positions (up till padding length) - object :: pd.DataFrame - feature matrix (np.matrix) of summed positions (up till padding length / sum_mods) - object :: pd.DataFrame - feature matrix (np.matrix) of specific positions (from positions argument) - object :: pd.DataFrame - feature matrix (np.matrix) of summed composition - """ - - # Local helper to ensure each unique warning is logged only once. 
- logged_warnings = set() - - def warn_once(message): - if message not in logged_warnings: - logged_warnings.add(message) - logger.warning(message) - - # TODO param flag for CCS prediction - def rolling_sum(a, n=2): - ret = np.cumsum(a, axis=1, dtype=np.float32) - ret[:, n:] = ret[:, n:] - ret[:, :-n] - return ret[:, n - 1 :] - - ret_list = {} - ret_list["matrix"] = {} - ret_list["matrix_sum"] = {} - ret_list["matrix_all"] = {} - ret_list["pos_matrix"] = {} - ret_list["matrix_hc"] = {} - - # Iterate over all instances - for psm, row_index in zip(psm_list, indexes): - peptidoform = psm.peptidoform - seq = peptidoform.sequence - seq_len = len(seq) - charge = psm.get_precursor_charge() - - # For now anything longer than padding length is cut away - # (C-terminal cutting) - if seq_len > padding_length: - seq = seq[0:padding_length] - seq_len = len(seq) - warn_once("Truncating peptide (too long): %s" % (seq)) - - peptide_composition = [mass.std_aa_comp[aa] for aa in seq] - - # Initialize all feature matrices - matrix = np.zeros( - (padding_length, len(dict_index.keys())), dtype=np.float16 - ) - matrix_hc = np.zeros( - (padding_length, len(dict_aa.keys())), dtype=np.float16 - ) - matrix_pos = np.zeros( - (len(positions), len(dict_index.keys())), dtype=np.float16 - ) - - for i, position_composition in enumerate(peptide_composition): - for k, v in position_composition.items(): - try: - matrix[i, dict_index[k]] = v - except IndexError: - warn_once( - f"Could not add the following value: pos {i} for atom {k} with value {v}" - ) - except KeyError: - warn_once( - f"Could not add the following value: pos {i} for atom {k} with value {v}" - ) - - for p in positions_pos: - try: - aa = seq[p] - except: - warn_once(f"Unable to get the following position: {p}") - continue - for atom, val in mass.std_aa_comp[aa].items(): - try: - matrix_pos[p, dict_index_pos[atom]] = val - except KeyError: - warn_once(f"Could not add the following atom: {atom}") - except IndexError: - warn_once(f"Could not add the following atom: {p} {atom} {val}") - - for pn in positions_neg: - try: - aa = seq[seq_len + pn] - except: - warn_once(f"Unable to get the following position: {p}") - continue - for atom, val in mass.std_aa_comp[aa].items(): - try: - matrix_pos[pn, dict_index_pos[atom]] = val - except KeyError: - warn_once(f"Could not add the following atom: {atom}") - except IndexError: - warn_once( - f"Could not add the following atom: {pn} {atom} {val}" - ) - - for i, peptide_position in enumerate(peptidoform.parsed_sequence): - try: - matrix_hc[i, dict_aa[peptide_position[0]]] = 1.0 - except KeyError: - warn_once( - f"Skipping the following (not in library): {i} {peptide_position}" - ) - except IndexError: - warn_once( - f"Could not add the following atom: {i} {peptide_position}" - ) - - if peptide_position[1] is not None: - try: - modification_composition = peptide_position[1][0].composition - except KeyError: - warn_once( - f"Skipping the following (not in library): {peptide_position[1]}" - ) - continue - except: - warn_once( - f"Skipping the following (not in library): {peptide_position[1]}" - ) - continue - - for ( - atom_position_composition, - atom_change, - ) in modification_composition.items(): - try: - matrix[ - i, dict_index[atom_position_composition] - ] += atom_change - if i in positions: - matrix_pos[ - i, dict_index_pos[atom_position_composition] - ] += atom_change - elif i - seq_len in positions: - matrix_pos[ - i - seq_len, - dict_index_pos[atom_position_composition], - ] += atom_change - except KeyError: - try: - 
warn_once( - f"Could not add the following atom: {atom_position_composition}, attempting to replace the [] part" - ) - atom_position_composition = sub( - "\[.*?\]", "", atom_position_composition - ) - matrix[ - i, dict_index[atom_position_composition] - ] += atom_change - if i in positions: - matrix_pos[ - i, dict_index_pos[atom_position_composition] - ] += atom_change - elif i - seq_len in positions: - matrix_pos[ - i - seq_len, - dict_index_pos[atom_position_composition], - ] += atom_change - except KeyError: - warn_once( - f"Could not add the following atom: {atom_position_composition}, second attempt, now ignored" - ) - continue - except: - warn_once( - f"Could not add the following atom: {atom_position_composition}, second attempt, now ignored" - ) - continue - except IndexError: - warn_once( - f"Could not add the following atom: {i} {atom_position_composition} {atom_change}" - ) - except: - warn_once( - f"Could not add the following atom: {atom_position_composition}, second attempt, now ignored" - ) - continue - - matrix_all = np.sum(matrix, axis=0) - matrix_all = np.append(matrix_all, seq_len) - - if predict_ccs: - matrix_all = np.append(matrix_all, (seq.count("H")) / float(seq_len)) - matrix_all = np.append( - matrix_all, - (seq.count("F") + seq.count("W") + seq.count("Y")) / float(seq_len), - ) - matrix_all = np.append( - matrix_all, (seq.count("D") + seq.count("E")) / float(seq_len) - ) - matrix_all = np.append( - matrix_all, (seq.count("K") + seq.count("R")) / float(seq_len) - ) - matrix_all = np.append(matrix_all, charge) - - matrix_sum = rolling_sum(matrix.T, n=2)[:, ::2].T - - ret_list["matrix"][row_index] = matrix - ret_list["matrix_sum"][row_index] = matrix_sum - ret_list["matrix_all"][row_index] = matrix_all - ret_list["pos_matrix"][row_index] = matrix_pos.flatten() - ret_list["matrix_hc"][row_index] = matrix_hc - - return ret_list - - def full_feat_extract( - self, - psm_list=[], - seqs=[], - mods=[], - predict_ccs=False, - identifiers=[], - charges=[], - ): - """ - Extract all features we can extract... 
Probably the function your want to call by default - - Parameters - ---------- - seqs : list - peptide sequence list; should correspond to mods and identifiers - mods : list - naming of the mods; should correspond to seqs and identifiers - identifiers : str - identifiers of the peptides; should correspond to seqs and mods - charges : list - optional list with charges, keep emtpy if these will not effect the predicted value - - Returns - ------- - pd.DataFrame - feature matrix - """ - # TODO Reintroduce for CCS, check CCS flag - if len(seqs) > 0: - list_of_psms = [] - for seq, mod, id in zip(seqs, mods, identifiers): - list_of_psms.append( - PSM(peptidoform=peprec_to_proforma(seq, mod), spectrum_id=id) - ) - psm_list = PSMList(psm_list=list_of_psms) - - if self.verbose: - t0 = time.time() - - # TODO pass CCS flag - if self.add_sum_feat: - if self.verbose: - logger.debug("Extracting compositional sum features for modifications") - X_feats_sum = self.get_feats_mods(psm_list, split_size=1, add_str="_sum") - if self.ptm_add_feat: - if self.verbose: - logger.debug("Extracting compositional add features for modifications") - X_feats_add = self.get_feats_mods( - psm_list, split_size=self.split_size, add_str="_add" - ) - if self.ptm_subtract_feat: - if self.verbose: - logger.debug( - "Extracting compositional subtract features for modifications" - ) - X_feats_neg = self.get_feats_mods( - psm_list, - split_size=self.split_size, - add_str="_subtract", - subtract_mods=True, - ) - if self.cnn_feats: - if self.verbose: - logger.debug("Extracting CNN features") - X_cnn = self.encode_atoms( - psm_list, - list(range(len(psm_list))), - charges=charges, - predict_ccs=predict_ccs, - ) - - if self.cnn_feats: - X = X_cnn - if self.add_sum_feat: +# fmt: off +DEFAULT_POSITIONS: set[int] = {0, 1, 2, 3, -1, -2, -3, -4} +DEFAULT_POSITIONS_POS: set[int] = {0, 1, 2, 3} +DEFAULT_POSITIONS_NEG: set[int] = {-1, -2, -3, -4} +DEFAULT_DICT_AA: dict[str, int] = { + "K": 0, "R": 1, "P": 2, "T": 3, "N": 4, "A": 5, "Q": 6, "V": 7, "S": 8, "G": 9, "I": 10, + "L": 11, "C": 12, "M": 13, "H": 14, "F": 15, "Y": 16, "W": 17, "E": 18, "D": 19, +} +DEFAULT_DICT_INDEX_POS: dict[str, int] = {"C": 0, "H": 1, "N": 2, "O": 3, "S": 4, "P": 5} +DEFAULT_DICT_INDEX: dict[str, int] = {"C": 0, "H": 1, "N": 2, "O": 3, "S": 4, "P": 5} +# fmt: on + + +def _truncate_sequence(seq: str, max_length: int) -> tuple[str, int]: + """Truncate the sequence if it exceeds the max_length.""" + if len(seq) > max_length: + warnings.warn(f"Truncating peptide (too long): {seq}", stacklevel=2) + seq = seq[:max_length] + return seq, len(seq) + + +def _fill_standard_matrix(seq: str, padding_length: int, dict_index: dict[str, int]) -> np.ndarray: + """Fill the standard composition matrix using mass.std_aa_comp.""" + mat = np.zeros((padding_length, len(dict_index)), dtype=np.float16) + for i, aa in enumerate(seq): + for atom, value in mass.std_aa_comp[aa].items(): try: - X = pd.concat([X, X_feats_sum], axis=1) - except BaseException: - X = X_feats_sum - if self.ptm_add_feat: + mat[i, dict_index[atom]] = value + except (KeyError, IndexError): + warnings.warn(f"Skipping atom {atom} at pos {i}", stacklevel=2) + return mat + + +def _fill_onehot_matrix( + parsed_seq: list, padding_length: int, dict_aa: dict[str, int] +) -> np.ndarray: + """Fill a one-hot matrix based on the parsed sequence tokens.""" + onehot = np.zeros((padding_length, len(dict_aa)), dtype=np.float16) + for i, token in enumerate(parsed_seq): + try: + onehot[i, dict_aa[token[0]]] = 1.0 + except (KeyError, 
IndexError):
+            warnings.warn(f"One-hot skip: {i} {token}", stacklevel=2)
+    return onehot
+
+
+def _fill_pos_matrix(
+    seq: str,
+    seq_len: int,
+    positions_pos: set[int],
+    positions_neg: set[int],
+    dict_index: dict[str, int],
+    dict_index_pos: dict[str, int],
+) -> np.ndarray:
+    """Fill positional matrix for atoms at specific positions."""
+    n_positions = len(positions_pos) + len(positions_neg)
+    pos_mat = np.zeros((n_positions, len(dict_index)), dtype=np.float16)
+    # For positive positions
+    for pos in positions_pos:
+        try:
+            aa = seq[pos]
+        except Exception:
+            warnings.warn(f"Unable to get pos {pos}", stacklevel=2)
+            continue
+        for atom, value in mass.std_aa_comp[aa].items():
             try:
-                X = pd.concat([X, X_feats_sum], axis=1)
-            except BaseException:
-                X = X_feats_sum
-        if self.ptm_add_feat:
+                # Positive positions index rows from the front; negative
+                # positions wrap around to the back of the matrix, matching
+                # the indexing used in _apply_modifications below
+                pos_mat[pos, dict_index_pos[atom]] = value
+            except (KeyError, IndexError):
+                warnings.warn(f"Pos matrix skip: {atom} at pos {pos}", stacklevel=2)
+    # For negative positions
+    for pos in positions_neg:
+        try:
+            aa = seq[seq_len + pos]
+        except Exception:
+            warnings.warn(f"Unable to get pos {pos}", stacklevel=2)
+            continue
+        for atom, value in mass.std_aa_comp[aa].items():
             try:
-                X = pd.concat([X, X_feats_add], axis=1)
-            except BaseException:
-                X = X_feats_add
-        if self.ptm_subtract_feat:
+                pos_mat[pos, dict_index_pos[atom]] = value
+            except (KeyError, IndexError):
+                warnings.warn(f"Pos matrix skip: {atom} at neg pos {pos}", stacklevel=2)
+    return pos_mat
+
+
+def _apply_modifications(
+    mat: np.ndarray,
+    pos_mat: np.ndarray,
+    parsed_seq: list,
+    seq_len: int,
+    dict_index: dict[str, int],
+    dict_index_pos: dict[str, int],
+    positions: set[int],
+) -> None:
+    """Apply modification changes to the matrices."""
+    for i, token in enumerate(parsed_seq):
+        if token[1] is None:
+            continue
+        try:
+            mod_comp = token[1][0].composition
+        except Exception:
+            warnings.warn(f"Skipping mod at pos {i}: {token[1]}", stacklevel=2)
+            continue
+        for atom_comp, change in mod_comp.items():
+            try:
+                mat[i, dict_index[atom_comp]] += change
+                if i in positions:
+                    pos_mat[i, dict_index_pos[atom_comp]] += change
+                elif (i - seq_len) in positions:
+                    pos_mat[i - seq_len, dict_index_pos[atom_comp]] += change
+            except KeyError:
+                try:
+                    warnings.warn(f"Replacing pattern for atom: {atom_comp}", stacklevel=2)
+                    atom_comp_clean = sub(r"\[.*?\]", "", atom_comp)
+                    mat[i, dict_index[atom_comp_clean]] += change
+                    if i in positions:
+                        pos_mat[i, dict_index_pos[atom_comp_clean]] += change
+                    elif (i - seq_len) in positions:
+                        pos_mat[i - seq_len, dict_index_pos[atom_comp_clean]] += change
+                except KeyError:
+                    warnings.warn(f"Ignoring atom {atom_comp} at pos {i}", stacklevel=2)
+                    continue
+            except IndexError:
+                warnings.warn(f"Index error for atom {atom_comp} at pos {i}", stacklevel=2)
+
+
+def _compute_rolling_sum(matrix: np.ndarray, n: int = 2) -> np.ndarray:
+    """Compute a rolling sum over the matrix."""
+    ret = np.cumsum(matrix, axis=1, dtype=np.float32)
+    ret[:, n:] = ret[:, n:] - ret[:, :-n]
+    return ret[:, n - 1 :]
+
+
+def encode_peptidoform(
+    peptidoform: 
Peptidoform | str, + predict_ccs: bool = False, + padding_length: int = 60, + positions: set[int] | None = None, + positions_pos: set[int] | None = None, + positions_neg: set[int] | None = None, + dict_aa: dict[str, int] | None = None, + dict_index_pos: dict[str, int] | None = None, + dict_index: dict[str, int] | None = None, +) -> dict[str, np.ndarray]: + """Extract features from a single peptidoform.""" + positions = positions or DEFAULT_POSITIONS + positions_pos = positions_pos or DEFAULT_POSITIONS_POS + positions_neg = positions_neg or DEFAULT_POSITIONS_NEG + dict_aa = dict_aa or DEFAULT_DICT_AA + dict_index_pos = dict_index_pos or DEFAULT_DICT_INDEX_POS + dict_index = dict_index or DEFAULT_DICT_INDEX + + if isinstance(peptidoform, str): + peptidoform = Peptidoform(peptidoform) + seq = peptidoform.sequence + charge = peptidoform.precursor_charge + seq, seq_len = _truncate_sequence(seq, padding_length) + + std_matrix = _fill_standard_matrix(seq, padding_length, dict_index) + onehot_matrix = _fill_onehot_matrix(peptidoform.parsed_sequence, padding_length, dict_aa) + pos_matrix = _fill_pos_matrix( + seq, seq_len, positions_pos, positions_neg, dict_index, dict_index_pos + ) + _apply_modifications( + std_matrix, + pos_matrix, + peptidoform.parsed_sequence, + seq_len, + dict_index, + dict_index_pos, + positions, + ) + + matrix_all = np.sum(std_matrix, axis=0) + matrix_all = np.append(matrix_all, seq_len) + if predict_ccs: + matrix_all = np.append(matrix_all, (seq.count("H")) / seq_len) + matrix_all = np.append( + matrix_all, (seq.count("F") + seq.count("W") + seq.count("Y")) / seq_len + ) + matrix_all = np.append(matrix_all, (seq.count("D") + seq.count("E")) / seq_len) + matrix_all = np.append(matrix_all, (seq.count("K") + seq.count("R")) / seq_len) + matrix_all = np.append(matrix_all, charge) + + matrix_sum = _compute_rolling_sum(std_matrix.T, n=2)[:, ::2].T + + return { + "matrix": std_matrix, + "matrix_sum": matrix_sum, + "matrix_all": matrix_all, + "pos_matrix": pos_matrix.flatten(), + "matrix_hc": onehot_matrix, + } + + +def aggregate_encodings( + encodings: list[dict[str, np.ndarray]], +) -> dict[str, dict[int, np.ndarray]]: + """Aggregate list of encodings into single dictionary.""" + return {key: {i: enc[key] for i, enc in enumerate(encodings)} for key in encodings[0]} + + +def unpack_features( + features: dict[str, np.ndarray], +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """Unpack dictionary with features to numpy arrays.""" + X_sum = np.stack(list(features["matrix_sum"].values())) + X_global = np.concatenate( + ( + np.stack(list(features["matrix_all"].values())), + np.stack(list(features["pos_matrix"].values())), + ), + axis=1, + ) + X_hc = np.stack(list(features["matrix_hc"].values())) + X_main = np.stack(list(features["matrix"].values())) -if __name__ == "__main__": - main() + return X_sum, X_global, X_hc, X_main diff --git a/deeplc/gui.py b/deeplc/gui.py index ae4cf27..47ee007 100644 --- a/deeplc/gui.py +++ b/deeplc/gui.py @@ -1,8 +1,10 @@ """Graphical user interface.""" + import sys from multiprocessing import freeze_support from pathlib import Path import importlib.resources + try: from gooey import Gooey, local_resource_path except ImportError: @@ -17,7 +19,7 @@ # Get path to package_data/images # Workaround with parent of specific file required for Python 3.9+ support -with importlib.resources.path(img_module, 'config_icon.png') as resource: +with importlib.resources.path(img_module, "config_icon.png") as resource: _IMG_DIR = Path(resource).parent @@ 
-27,12 +29,13 @@ tabbed_groups=True, default_size=(720, 480), monospace_display=True, - target=None if getattr(sys, 'frozen', False) else "deeplc-gui" + target=None if getattr(sys, "frozen", False) else "deeplc-gui", ) def start_gui(): """Run main with GUI enabled.""" freeze_support() # Required for multiprocessing with PyInstaller main(gui=True) + if __name__ == "__main__": start_gui() diff --git a/deeplc/mods/full_hc_PXD005573_pub_1fd8363d9af9dcad3be7553c39396960.keras b/deeplc/mods/full_hc_PXD005573_pub_1fd8363d9af9dcad3be7553c39396960.keras deleted file mode 100644 index 1a1c6af..0000000 Binary files a/deeplc/mods/full_hc_PXD005573_pub_1fd8363d9af9dcad3be7553c39396960.keras and /dev/null differ diff --git a/deeplc/mods/full_hc_PXD005573_pub_1fd8363d9af9dcad3be7553c39396960.pt b/deeplc/mods/full_hc_PXD005573_pub_1fd8363d9af9dcad3be7553c39396960.pt new file mode 100644 index 0000000..82399a4 Binary files /dev/null and b/deeplc/mods/full_hc_PXD005573_pub_1fd8363d9af9dcad3be7553c39396960.pt differ diff --git a/deeplc/mods/full_hc_PXD005573_pub_8c22d89667368f2f02ad996469ba157e.keras b/deeplc/mods/full_hc_PXD005573_pub_8c22d89667368f2f02ad996469ba157e.keras deleted file mode 100644 index 7016886..0000000 Binary files a/deeplc/mods/full_hc_PXD005573_pub_8c22d89667368f2f02ad996469ba157e.keras and /dev/null differ diff --git a/deeplc/mods/full_hc_PXD005573_pub_8c22d89667368f2f02ad996469ba157e.pt b/deeplc/mods/full_hc_PXD005573_pub_8c22d89667368f2f02ad996469ba157e.pt new file mode 100644 index 0000000..2d67b07 Binary files /dev/null and b/deeplc/mods/full_hc_PXD005573_pub_8c22d89667368f2f02ad996469ba157e.pt differ diff --git a/deeplc/mods/full_hc_PXD005573_pub_cb975cfdd4105f97efa0b3afffe075cc.keras b/deeplc/mods/full_hc_PXD005573_pub_cb975cfdd4105f97efa0b3afffe075cc.keras deleted file mode 100644 index e2d23ba..0000000 Binary files a/deeplc/mods/full_hc_PXD005573_pub_cb975cfdd4105f97efa0b3afffe075cc.keras and /dev/null differ diff --git a/deeplc/mods/full_hc_PXD005573_pub_cb975cfdd4105f97efa0b3afffe075cc.pt b/deeplc/mods/full_hc_PXD005573_pub_cb975cfdd4105f97efa0b3afffe075cc.pt new file mode 100644 index 0000000..292a21a Binary files /dev/null and b/deeplc/mods/full_hc_PXD005573_pub_cb975cfdd4105f97efa0b3afffe075cc.pt differ diff --git a/deeplc/onnx_model.py b/deeplc/onnx_model.py new file mode 100644 index 0000000..bef6e9b --- /dev/null +++ b/deeplc/onnx_model.py @@ -0,0 +1,31 @@ + +import os +from tensorflow.keras.models import load_model +import tensorflow as tf +import tf2onnx +from onnx2torch import convert +import torch + +deeplc_dir = os.path.dirname(os.path.realpath(__file__)) +DEFAULT_MODELS = [ + "mods/full_hc_PXD005573_pub_1fd8363d9af9dcad3be7553c39396960.keras", + "mods/full_hc_PXD005573_pub_8c22d89667368f2f02ad996469ba157e.keras", + "mods/full_hc_PXD005573_pub_cb975cfdd4105f97efa0b3afffe075cc.keras", +] +DEFAULT_MODELS = [os.path.join(deeplc_dir, dm) for dm in DEFAULT_MODELS] +def _convert_to_onnx(): + for model_path in DEFAULT_MODELS: + if os.path.exists(model_path): + mod = load_model(model_path) + spec = [ + tf.TensorSpec([None, 60, 6], tf.float32, name="input_1"), + tf.TensorSpec([None, 30, 6], tf.float32, name="input_2"), + tf.TensorSpec([None, 55], tf.float32, name="input_3"), + tf.TensorSpec([None, 60, 20], tf.float32, name="input_4"), + ] + onnx_model, _ = tf2onnx.convert.from_keras(mod, input_signature=spec, opset=13) + torch_model = convert(onnx_model) + torch.save(torch_model, model_path.replace(".keras", ".pt")) + +if __name__ == "__main__": + _convert_to_onnx() \ No 
newline at end of file diff --git a/deeplc/plot.py b/deeplc/plot.py index fd337d0..a4b2a37 100644 --- a/deeplc/plot.py +++ b/deeplc/plot.py @@ -12,7 +12,7 @@ def scatter( observed_column: str = "Observed retention time", xaxis_label: str = "Observed retention time", yaxis_label: str = "Predicted retention time", - plot_title: str = "Predicted vs. observed retention times" + plot_title: str = "Predicted vs. observed retention times", ) -> go.Figure: """ Plot a scatter plot of the predicted vs. observed retention times. @@ -42,15 +42,15 @@ def scatter( y=predicted_column, opacity=0.3, ) - + # Draw diagonal line fig.add_scatter( x=[min(df[observed_column]), max(df[observed_column])], y=[min(df[observed_column]), max(df[observed_column])], - mode='lines', - line=dict(color='red', width=3, dash='dash'), + mode="lines", + line=dict(color="red", width=3, dash="dash"), ) - + # Hide legend fig.update_layout( title=plot_title, @@ -58,7 +58,7 @@ def scatter( xaxis_title=xaxis_label, yaxis_title=yaxis_label, ) - + return fig @@ -85,9 +85,7 @@ def distribution_baseline( """ # Get baseline data baseline_df = pd.read_csv( - Path(__file__) - .absolute() - .parent.joinpath("baseline_performance/baseline_predictions.csv") + Path(__file__).absolute().parent.joinpath("baseline_performance/baseline_predictions.csv") ) baseline_df["rel_mae_best"] = baseline_df[ ["rel_mae_transfer_learning", "rel_mae_new_model", "rel_mae_calibrate"] @@ -97,9 +95,7 @@ def distribution_baseline( # Calculate current RMAE and percentile compared to baseline mae = sum(abs(df[observed_column] - df[predicted_column])) / len(df.index) mae_rel = (mae / max(df[observed_column])) * 100 - percentile = round( - (baseline_df["rel_mae_transfer_learning"] < mae_rel).mean() * 100, 1 - ) + percentile = round((baseline_df["rel_mae_transfer_learning"] < mae_rel).mean() * 100, 1) # Calculate x-axis range with 5% padding all_values = np.append(baseline_df["rel_mae_transfer_learning"].values, mae_rel) @@ -140,10 +136,7 @@ def distribution_baseline( ) fig.update_xaxes(range=[x_min, x_max]) fig.update_layout( - title=( - f"Current DeepLC performance compared to {len(baseline_df.index)} " - "datasets" - ), + title=(f"Current DeepLC performance compared to {len(baseline_df.index)} datasets"), xaxis_title="Relative mean absolute error (%)", ) diff --git a/deeplc/trainl3.py b/deeplc/trainl3.py index a28e083..1682485 100644 --- a/deeplc/trainl3.py +++ b/deeplc/trainl3.py @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. -This project was made possible by MASSTRPLAN. MASSTRPLAN received funding -from the Marie Sklodowska-Curie EU Framework for Research and Innovation +This project was made possible by MASSTRPLAN. MASSTRPLAN received funding +from the Marie Sklodowska-Curie EU Framework for Research and Innovation Horizon 2020, under Grant Agreement No. 675132. 
""" @@ -28,7 +28,7 @@ _has_sklearn = True -def train_en(X, y, n_jobs=16, cv=None): +def train_elastic_net(X, y, n_jobs=16, cv=None): """ Function that trains Layer 3 of CALLC (elastic net) @@ -99,7 +99,7 @@ def train_en(X, y, n_jobs=16, cv=None): grid.fit(X, y) crossv_mod.set_params(**grid.best_params_) - + ret_mod.set_params(**grid.best_params_) ret_mod.fit(X, y) diff --git a/examples/deeplc_example.py b/examples/deeplc_example.py index 95a5c5d..2a44b2b 100644 --- a/examples/deeplc_example.py +++ b/examples/deeplc_example.py @@ -1,9 +1,10 @@ # Imports import pandas as pd from matplotlib import pyplot as plt + from deeplc import DeepLC, FeatExtractor -if __name__ == '__main__': +if __name__ == "__main__": # Input files peptide_file = "examples/datasets/dia.csv" @@ -14,14 +15,14 @@ df = df.fillna("") # Generate some identifiers, any kind of identifiers will do - df.index = ["Pep_"+str(dfi) for dfi in df.index] + df.index = ["Pep_" + str(dfi) for dfi in df.index] # Make a feature extraction object. # This step can be skipped if you want to use the default feature extraction # settings. In this example we will use a model that does not use RDKit features # so we skip the chemical descriptor making procedure. - #f_extractor = FeatExtractor(chem_descr_feat=False,verbose=False) + # f_extractor = FeatExtractor(chem_descr_feat=False,verbose=False) f_extractor = FeatExtractor( add_sum_feat=False, ptm_add_feat=False, @@ -30,26 +31,28 @@ chem_descr_feat=False, add_comp_feat=False, cnn_feats=True, - verbose=True + verbose=True, ) # Initiate a DeepLC instance that will perform the calibration and predictions dlc = DeepLC( - path_model=["deeplc/mods/full_hc_hela_hf_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", - "deeplc/mods/full_hc_hela_hf_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", - "deeplc/mods/full_hc_hela_hf_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", - "deeplc/mods/full_hc_PXD005573_mcp_cb975cfdd4105f97efa0b3afffe075cc.hdf5"], + path_model=[ + "deeplc/mods/full_hc_hela_hf_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5", + "deeplc/mods/full_hc_hela_hf_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5", + "deeplc/mods/full_hc_hela_hf_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + "deeplc/mods/full_hc_PXD005573_mcp_cb975cfdd4105f97efa0b3afffe075cc.hdf5", + ], cnn_model=True, f_extractor=f_extractor, verbose=True, write_library=True, - use_library="deeplc/library/library.csv" + use_library="deeplc/library/library.csv", ) df["tr"] = list(range(len(df.index))) # To demonstrate DeepLC's callibration, we'll induce some an artificial # transformation into the retention times - df["tr"] = df["tr"]**0.85 + df["tr"] = df["tr"] ** 0.85 # Calibrate the original model based on the new retention times dlc.calibrate_preds(seq_df=df) @@ -57,19 +60,19 @@ print("a") # Make predictions; calibrated and uncalibrated - preds_cal = dlc.make_preds(seq_df=df) + preds_cal = dlc._make_preds(seq_df=df) print("b") - preds_uncal = dlc.make_preds(seq_df=df, calibrate=False) + preds_uncal = dlc._make_preds(seq_df=df, calibrate=False) print("c") # Compare calibrated and uncalibrated predictions - #print("Predictions (calibrated): ", preds_cal) - #print("Predictions (uncalibrated): ", preds_uncal) + # print("Predictions (calibrated): ", preds_cal) + # print("Predictions (uncalibrated): ", preds_uncal) - plt.scatter(df["tr"],preds_cal,label="Calibrated",s=1) - plt.scatter(df["tr"],preds_uncal,label="Uncalibrated",s=1) + plt.scatter(df["tr"], preds_cal, label="Calibrated", s=1) + 
plt.scatter(df["tr"], preds_uncal, label="Uncalibrated", s=1) plt.legend() - plt.savefig('deeplc_calibrated_vs_uncalibrated.png') + plt.savefig("deeplc_calibrated_vs_uncalibrated.png") diff --git a/pyproject.toml b/pyproject.toml index 31e8fe3..49df3f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "deeplc" -version = "3.1.8" +version = "4.0.0-dev1" description = "DeepLC: Retention time prediction for (modified) peptides using Deep Learning." readme = "README.md" license = { file = "LICENSE" } @@ -9,6 +9,7 @@ authors = [ { name = "Niels Hulstaert" }, { name = "Arthur Declercq" }, { name = "Ralf Gabriels" }, + { name = "Alireza Nameni" }, { name = "Lennart Martens" }, { name = "Sven Degroeve" }, ] @@ -31,11 +32,12 @@ keywords = [ ] dependencies = [ - "tensorflow>=2.15.0,<3", + "dask>2023", + "torch>=2.6.0,<3", + "onnx2torch>=0.1.5", "numpy>=1.17", "pandas>=0.25", "scikit-learn>=1.2.0", - "deeplcretrainer>=1,<2", "psm_utils>=0.2.3" ] @@ -61,3 +63,13 @@ build-backend = "setuptools.build_meta" [tool.setuptools] packages = ["deeplc"] include-package-data = true + +[tool.ruff] +line-length = 99 +target-version = 'py311' + +[tool.ruff.lint] +select = ["E", "F", "UP", "B", "SIM", "I"] + +[tool.ruff.format] +docstring-code-format = true diff --git a/streamlit/deeplc_streamlit.py b/streamlit/deeplc_streamlit.py index 399dc5d..fcd574d 100644 --- a/streamlit/deeplc_streamlit.py +++ b/streamlit/deeplc_streamlit.py @@ -109,7 +109,8 @@ def _main_page(self): st.subheader("Prediction speed boost") self.user_input["use_library"] = st.checkbox( - "Use prediction library for speed-up", help=self.texts.Help.use_library + "Use prediction library for speed-up", + help=self.texts.Help.use_library, ) st.markdown(self.texts.Help.use_library_agreement) @@ -334,9 +335,9 @@ class Sidebar: --- Currently using the following package versions:
- [![DeepLC](https://img.shields.io/badge/deeplc-{version('deeplc')}-blue?style=flat-square&logoColor=white&logo=pypi)](https://github.com/compomics/deeplc) - [![Tensorflow](https://img.shields.io/badge/tensorflow-{version('tensorflow')}-blue?style=flat-square&logoColor=white&logo=pypi)](https://github.com/tensorflow/tensorflow) - [![Streamlit](https://img.shields.io/badge/streamlit-{version('streamlit')}-blue?style=flat-square&logoColor=white&logo=pypi)](https://github.com/streamlit/streamlit) + [![DeepLC](https://img.shields.io/badge/deeplc-{version("deeplc")}-blue?style=flat-square&logoColor=white&logo=pypi)](https://github.com/compomics/deeplc) + [![Tensorflow](https://img.shields.io/badge/tensorflow-{version("tensorflow")}-blue?style=flat-square&logoColor=white&logo=pypi)](https://github.com/tensorflow/tensorflow) + [![Streamlit](https://img.shields.io/badge/streamlit-{version("streamlit")}-blue?style=flat-square&logoColor=white&logo=pypi)](https://github.com/streamlit/streamlit) Latest DeepLC version:
![PyPI](https://img.shields.io/pypi/v/deeplc?style=flat-square&logoColor=white&logo=pypi)
diff --git a/streamlit/streamlit_utils.py b/streamlit/streamlit_utils.py
index 3e6bec5..c3a7df7 100644
--- a/streamlit/streamlit_utils.py
+++ b/streamlit/streamlit_utils.py
@@ -80,4 +80,4 @@ def hide_streamlit_menu():
 @st.cache
 def save_dataframe(df):
     """Save dataframe to file object, with streamlit cache."""
-    return df.to_csv().encode('utf-8')
+    return df.to_csv().encode("utf-8")
diff --git a/test.png b/test.png
deleted file mode 100644
index 4e3138e..0000000
Binary files a/test.png and /dev/null differ
diff --git a/tests/test_deepcallc.py b/tests/test_deepcallc.py
index 5505b51..798be6e 100644
--- a/tests/test_deepcallc.py
+++ b/tests/test_deepcallc.py
@@ -1,77 +1,84 @@
 import pandas as pd
-from deeplc import DeepLC
 from matplotlib import pyplot as plt
 
+from deeplc import DeepLC
+
+
 def main():
     peptide_file = "temp_data/PXD005573_mcp.csv"
     calibration_file = "temp_data/PXD005573_mcp.csv"
 
     pep_df = pd.read_csv(peptide_file, sep=",")
-    pep_df['modifications'] = pep_df['modifications'].fillna("")
+    pep_df["modifications"] = pep_df["modifications"].fillna("")
 
     cal_df = pd.read_csv(calibration_file, sep=",")
-    cal_df['modifications'] = cal_df['modifications'].fillna("")
-
+    cal_df["modifications"] = cal_df["modifications"].fillna("")
+
     pep_df = pep_df.sample(50)
     cal_df = cal_df.sample(50)
 
-    mods = ["C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_arabidopsis_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5",
-            #"C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD005573_mcp_1fd8363d9af9dcad3be7553c39396960.hdf5",
-            #"C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD005573_mcp_8c22d89667368f2f02ad996469ba157e.hdf5",
-            #"C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD005573_mcp_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD008783_median_calibrate_1fd8363d9af9dcad3be7553c39396960.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD008783_median_calibrate_8c22d89667368f2f02ad996469ba157e.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD008783_median_calibrate_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_dia_fixed_mods_1fd8363d9af9dcad3be7553c39396960.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_dia_fixed_mods_8c22d89667368f2f02ad996469ba157e.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_dia_fixed_mods_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_phospho_kai_li_5ee8aaa41d387bfffb8cda966348937c.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_phospho_kai_li_8c488fed5e0d0b07cf217fe3c30e55c6.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_phospho_kai_li_f3c75e74dd7b16180edf6f6f0d78a4a6.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_prosit_ptm_2020_1fd8363d9af9dcad3be7553c39396960.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_prosit_ptm_2020_8c22d89667368f2f02ad996469ba157e.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_prosit_ptm_2020_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_tmt_data_consensus_ticnum_filtered_5ee8aaa41d387bfffb8cda966348937c.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_tmt_data_consensus_ticnum_filtered_8c488fed5e0d0b07cf217fe3c30e55c6.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_120min_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_120min_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_120min_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_arabidopsis_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_arabidopsis_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_hf_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_hf_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_hf_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_1h_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_1h_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_1h_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_2h_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_2h_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_2h_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_pancreas_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_pancreas_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_pancreas_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_1h_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_1h_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_1h_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_2h_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_2h_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_2h_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_60min_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_60min_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5",
-            "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_60min_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5"]
-
-    dlc = DeepLC(write_library=False,
-                 use_library="",
-                 pygam_calibration=True,
-                 deepcallc_mod=True,
-                 path_model=mods,
-                 reload_library=False)
+    mods = [
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_arabidopsis_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5",
+        # "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD005573_mcp_1fd8363d9af9dcad3be7553c39396960.hdf5",
+        # "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD005573_mcp_8c22d89667368f2f02ad996469ba157e.hdf5",
+        # "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD005573_mcp_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD008783_median_calibrate_1fd8363d9af9dcad3be7553c39396960.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD008783_median_calibrate_8c22d89667368f2f02ad996469ba157e.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_PXD008783_median_calibrate_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_dia_fixed_mods_1fd8363d9af9dcad3be7553c39396960.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_dia_fixed_mods_8c22d89667368f2f02ad996469ba157e.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_dia_fixed_mods_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_phospho_kai_li_5ee8aaa41d387bfffb8cda966348937c.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_phospho_kai_li_8c488fed5e0d0b07cf217fe3c30e55c6.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_phospho_kai_li_f3c75e74dd7b16180edf6f6f0d78a4a6.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_prosit_ptm_2020_1fd8363d9af9dcad3be7553c39396960.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_prosit_ptm_2020_8c22d89667368f2f02ad996469ba157e.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_prosit_ptm_2020_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_tmt_data_consensus_ticnum_filtered_5ee8aaa41d387bfffb8cda966348937c.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_tmt_data_consensus_ticnum_filtered_8c488fed5e0d0b07cf217fe3c30e55c6.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_120min_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_120min_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_120min_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_arabidopsis_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_arabidopsis_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_hf_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_hf_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_hf_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_1h_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_1h_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_1h_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_2h_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_2h_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_hela_lumos_2h_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_pancreas_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_pancreas_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_pancreas_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_1h_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_1h_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_1h_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_2h_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_2h_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_plasma_lumos_2h_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_60min_psms_aligned_1fd8363d9af9dcad3be7553c39396960.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_60min_psms_aligned_8c22d89667368f2f02ad996469ba157e.hdf5",
+        "C:/Users/robbin/Documents/Github/DeepLCModels/full_hc_yeast_60min_psms_aligned_cb975cfdd4105f97efa0b3afffe075cc.hdf5",
+    ]
+
+    dlc = DeepLC(
+        write_library=False,
+        use_library="",
+        pygam_calibration=True,
+        deepcallc_mod=True,
+        path_model=mods,
+        reload_library=False,
+    )
 
     dlc.calibrate_preds(seq_df=cal_df)
-    preds = dlc.make_preds(seq_df=cal_df)
+    preds = dlc._make_preds(seq_df=cal_df)
 
-    plt.scatter(cal_df["tr"],preds)
+    plt.scatter(cal_df["tr"], preds)
     plt.show()
 
+
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/tests/test_deeplc.py b/tests/test_deeplc.py
index 4a5af2f..ed60f92 100644
--- a/tests/test_deeplc.py
+++ b/tests/test_deeplc.py
@@ -22,9 +22,9 @@ def _r2_score(y_true, y_pred):
 
 def test_cli_basic():
     """Test command line interface help message."""
-    assert (
-        subprocess.getstatusoutput("deeplc -h")[0] == 0
-    ), "`deeplc -h` returned non-zero status code"
+    assert subprocess.getstatusoutput("deeplc -h")[0] == 0, (
+        "`deeplc -h` returned non-zero status code"
+    )
 
 
 def test_cli_full():
@@ -48,8 +48,10 @@ def test_cli_full():
     train_df = pd.read_csv(file_path_pred)
     model_r2 = _r2_score(train_df["tr"], preds_df["predicted retention time"])
     logging.info("DeepLC R2 score on %s: %f", file_path_pred, model_r2)
-    assert model_r2 > 0.90, f"DeepLC R2 score on {file_path_pred} below 0.9 \
-(was {model_r2})"
+    assert model_r2 > 0.90, (
+        f"DeepLC R2 score on {file_path_pred} below 0.9 \
+(was {model_r2})"
+    )
 
 
 if __name__ == "__main__":
diff --git a/tests/test_deeplc_lib.py b/tests/test_deeplc_lib.py
index a17748c..c984196 100644
--- a/tests/test_deeplc_lib.py
+++ b/tests/test_deeplc_lib.py
@@ -1,28 +1,29 @@
 import pandas as pd
-from deeplc import DeepLC
 from matplotlib import pyplot as plt
 
+from deeplc import DeepLC
+
+
 def main():
     peptide_file = "examples/datasets/test_pred.csv"
     calibration_file = "examples/datasets/test_train.csv"
 
     pep_df = pd.read_csv(peptide_file, sep=",")
-    pep_df['modifications'] = pep_df['modifications'].fillna("")
+    pep_df["modifications"] = pep_df["modifications"].fillna("")
 
     cal_df = pd.read_csv(calibration_file, sep=",")
-    cal_df['modifications'] = cal_df['modifications'].fillna("")
-
-    dlc = DeepLC(write_library=False,
-                 use_library="",
-                 reload_library=False)
-    #write_library=True,
-    #use_library="lib.csv",
-    #reload_library=True)
+    cal_df["modifications"] = cal_df["modifications"].fillna("")
+
+    dlc = DeepLC(write_library=False, use_library="", reload_library=False)
+    # write_library=True,
+    # use_library="lib.csv",
+    # reload_library=True)
 
     dlc.calibrate_preds(seq_df=cal_df)
-    preds = dlc.make_preds(seq_df=cal_df)
+    preds = dlc._make_preds(seq_df=pep_df)
 
-    plt.scatter(pep_df["tr"],preds)
+    plt.scatter(pep_df["tr"], preds)
     plt.show()
+
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/tests/test_deeplc_retrain.py b/tests/test_deeplc_retrain.py
index a67c7a4..365819b 100644
--- a/tests/test_deeplc_retrain.py
+++ b/tests/test_deeplc_retrain.py
@@ -1,28 +1,29 @@
 import pandas as pd
-from deeplc import DeepLC
 from matplotlib import pyplot as plt
 
+from deeplc import DeepLC
+
+
 def main():
     peptide_file = "examples/datasets/test_pred.csv"
     calibration_file = "examples/datasets/test_train.csv"
 
     pep_df = pd.read_csv(peptide_file, sep=",")
-    pep_df['modifications'] = pep_df['modifications'].fillna("")
+    pep_df["modifications"] = pep_df["modifications"].fillna("")
 
     cal_df = pd.read_csv(calibration_file, sep=",")
-    cal_df['modifications'] = cal_df['modifications'].fillna("")
-
-    dlc = DeepLC(write_library=False,
-                 deeplc_retrain=True,
-                 reload_library=False)
-    #write_library=True,
-    #use_library="lib.csv",
-    #reload_library=True)
+    cal_df["modifications"] = cal_df["modifications"].fillna("")
+
+    dlc = DeepLC(write_library=False, deeplc_retrain=True, reload_library=False)
+    # write_library=True,
+    # use_library="lib.csv",
+    # reload_library=True)
 
     dlc.calibrate_preds(seq_df=cal_df)
-    preds = dlc.make_preds(seq_df=cal_df)
+    preds = dlc._make_preds(seq_df=cal_df)
 
-    plt.scatter(cal_df["tr"],preds)
+    plt.scatter(cal_df["tr"], preds)
     plt.show()
+
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/tests/test_feat_extractor.py b/tests/test_feat_extractor.py
new file mode 100644
index 0000000..8ecc39e
--- /dev/null
+++ b/tests/test_feat_extractor.py
@@ -0,0 +1,93 @@
+import numpy as np
+from psm_utils.psm import Peptidoform
+
+from deeplc.feat_extractor import encode_peptidoform
+
+
+def _check_result_structure(result: dict[str, np.ndarray]) -> None:
+    expected_keys = {"matrix", "matrix_sum", "matrix_all", "pos_matrix", "matrix_hc"}
+    assert expected_keys.issubset(result.keys())
+
+
+def test_encode_peptidoform_without_modification():
+    # Test using a simple peptidoform string without modifications.
+    peptide = "ACDE/2"
+    result = encode_peptidoform(peptide)
+    _check_result_structure(result)
+
+    # Default padding_length is 60 and there are 6 atoms in DEFAULT_DICT_INDEX.
+    assert result["matrix"].shape == (60, 6)
+    # matrix_all is formed by summing matrix (length 6) and appending the sequence length.
+    # So without CCS prediction, matrix_all length should be 7.
+    assert result["matrix_all"].shape == (7,)
+
+    # The sequence is taken from the part before the '/'.
+    expected_seq = "ACDE"
+    # The last element of matrix_all should be the sequence length.
+    assert result["matrix_all"][-1] == len(expected_seq)
+
+
+def test_encode_peptidoform_with_modification():
+    # Test using a peptidoform string with a modification.
+    peptide = "AC[Carbamidomethyl]DE/2"
+    result = encode_peptidoform(peptide)
+    _check_result_structure(result)
+
+    # Check basic structure as before.
+    assert result["matrix"].shape == (60, 6)
+    expected_seq = "ACDE"  # Modification does not alter the base sequence.
+    assert result["matrix_all"][-1] == len(expected_seq)
+
+
+def test_encode_peptidoform_with_ccs_prediction():
+    # Test with predict_ccs=True.
+    peptide = "ACDE/2"
+    result = encode_peptidoform(peptide, predict_ccs=True)
+    _check_result_structure(result)
+
+    # Without predict_ccs, matrix_all has 7 elements (6 sums + sequence length).
+    # With predict_ccs, 5 additional values are appended (ratios and charge),
+    # resulting in a total length of 12.
+    assert result["matrix_all"].shape == (12,)
+
+    # Verify that the last element (charge) matches the precursor charge from Peptidoform.
+    pf = Peptidoform(peptide)
+    assert result["matrix_all"][-1] == pf.precursor_charge
+
+
+def test_encode_peptidoform_feature_values() -> None:
+    """
+    Test that the returned feature values match the expected results
+    for a known peptidoform.
+    """
+    peptide = "ACDE/2"
+    result = encode_peptidoform(peptide, predict_ccs=False, padding_length=60)
+
+    # For peptide "ACDE", using DEFAULT_DICT_INDEX = {"C":0,"H":1,"N":2,"O":3,"S":4,"P":5}
+    # Based on pyteomics.mass.std_aa_comp, the expected composition is:
+    # For 'A': { "C": 3, "H": 5, "N": 1, "O": 1 } -> row0: [3,5,1,1,0,0]
+    # For 'C': { "C": 3, "H": 5, "N": 1, "O": 1, "S": 1 } -> row1: [3,5,1,1,1,0]
+    # For 'D': { "C": 4, "H": 5, "N": 1, "O": 3 } -> row2: [4,5,1,3,0,0]
+    # For 'E': { "C": 5, "H": 7, "N": 1, "O": 3 } -> row3: [5,7,1,3,0,0]
+    #
+    # Sum each column over positions 0 to 3:
+    # C: 3+3+4+5 = 15
+    # H: 5+5+5+7 = 22
+    # N: 1+1+1+1 = 4
+    # O: 1+1+3+3 = 8
+    # S: 0+1+0+0 = 1
+    # P: 0+0+0+0 = 0
+    # Then sequence length appended: 4
+    expected_matrix_all = np.array([15, 22, 4, 8, 1, 0, 4], dtype=np.float32)
+
+    # Check matrix_all values using np.allclose.
+    assert np.allclose(result["matrix_all"], expected_matrix_all), (
+        f"Expected matrix_all: {expected_matrix_all}, got: {result['matrix_all']}"
+    )
+
+    # Also check the first row of the matrix for letter 'A'.
+    # Expected for 'A': [3,5,1,1,0,0]
+    expected_row_A = np.array([3, 5, 1, 1, 0, 0], dtype=np.float16)
+    assert np.allclose(result["matrix"][0][:6], expected_row_A), (
+        f"Expected row for A: {expected_row_A}, got: {result['matrix'][0][:6]}"
+    )
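Context for the `streamlit/streamlit_utils.py` change above: `save_dataframe` returns UTF-8 encoded CSV bytes memoized by Streamlit's cache, which is exactly the shape `st.download_button` expects. A minimal usage sketch follows; the app scaffolding and import path here are hypothetical, only `save_dataframe` itself comes from this diff. Note that newer Streamlit releases replace the deprecated `@st.cache` with `@st.cache_data`.

```python
import pandas as pd
import streamlit as st

# Hypothetical import path; in this repo the helper lives in
# streamlit/streamlit_utils.py.
from streamlit_utils import save_dataframe

df = pd.DataFrame({"seq": ["ACDE", "KKDE"], "tr": [12.3, 4.5]})

# save_dataframe returns cached UTF-8 CSV bytes, so script reruns do not
# re-serialize an unchanged dataframe.
st.download_button(
    label="Download predictions as CSV",
    data=save_dataframe(df),
    file_name="predictions.csv",
    mime="text/csv",
)
```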
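The three test scripts touched above (`test_deepcallc.py`, `test_deeplc_lib.py`, `test_deeplc_retrain.py`) share one calibrate-then-predict pattern. Below is a condensed sketch of that flow, using only constructor arguments and methods that appear in this diff; note that the public `make_preds` is replaced by the private `_make_preds` throughout, so these scripts now call into DeepLC internals.

```python
import pandas as pd
from matplotlib import pyplot as plt

from deeplc import DeepLC

# Calibration peptides; empty modification fields must be "" rather than NaN.
cal_df = pd.read_csv("examples/datasets/test_train.csv")
cal_df["modifications"] = cal_df["modifications"].fillna("")

# Library caching stays disabled in all of these tests.
dlc = DeepLC(write_library=False, use_library="", reload_library=False)

# Fit the calibration on observed retention times, then predict.
dlc.calibrate_preds(seq_df=cal_df)
preds = dlc._make_preds(seq_df=cal_df)

# Observed vs. predicted retention times should hug the diagonal.
plt.scatter(cal_df["tr"], preds)
plt.show()
```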
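`test_cli_full` gates on `_r2_score`, whose body sits outside the hunk shown above. For reference, a standard coefficient-of-determination implementation; this is an assumed equivalent, not necessarily the repo's exact code.

```python
import numpy as np


def _r2_score(y_true, y_pred):
    """Coefficient of determination: R^2 = 1 - SS_res / SS_tot."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - y_true.mean()) ** 2)
    return 1.0 - ss_res / ss_tot
```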
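The expected values in `test_encode_peptidoform_feature_values` can be reproduced by hand. The sketch below recomputes the summed atomic composition of "ACDE" from the residue formulas quoted in the test's comments; the composition table is copied from those comments rather than imported from `deeplc`, so it stands in for the real extractor only for these four residues.

```python
import numpy as np

# Residue compositions in (C, H, N, O, S, P) order, as quoted in the test
# comments (values follow pyteomics.mass.std_aa_comp).
RESIDUE_COMP = {
    "A": [3, 5, 1, 1, 0, 0],
    "C": [3, 5, 1, 1, 1, 0],
    "D": [4, 5, 1, 3, 0, 0],
    "E": [5, 7, 1, 3, 0, 0],
}

seq = "ACDE"
matrix = np.array([RESIDUE_COMP[aa] for aa in seq], dtype=np.float32)

# Sum each atom count over all positions, then append the sequence length,
# mirroring how matrix_all is assembled when predict_ccs is False.
matrix_all = np.append(matrix.sum(axis=0), len(seq))

assert np.allclose(matrix_all, [15, 22, 4, 8, 1, 0, 4])
```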