diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..7c163e2 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,15 @@ + +[run] +branch = True +source = sourced/ml/core + +[report] +exclude_lines = + no cover + raise NotImplementedError + if __name__ == "__main__": +ignore_errors = True +omit = + sourced/ml/core/tests/* + sourced/ml/core/swivel.py + sourced/ml/core/bigartm.py diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..d6f48de --- /dev/null +++ b/.flake8 @@ -0,0 +1,17 @@ +[flake8] +ignore=B008,E121,E123,E126,E203,E226,E24,E704,W503,W504,D100,D105,D200,D301,D402 +max-line-length=99 +exclude= + .git + doc +inline-quotes=" +import-order-style=appnexus +application-package-names=sourced.ml.core +per-file-ignores= + **/tests/**:D + # Should be resolved one by one + # Related issue: https://github.com/src-d/ml/issues/354 + ./sourced/ml/core/extractors/*:D + ./sourced/ml/core/models/**:D + ./sourced/ml/core/algorithms/**:D + ./sourced/ml/core/utils/*:D \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..14f6144 --- /dev/null +++ b/.gitignore @@ -0,0 +1,116 @@ + +#Mac OS +*.DS_Store + +#PyCharm IDE +.idea/ + +# Documentation build files +doc/_build/ +doc/ast2vec.rst +doc/modules.rst + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# CI +.ci \ No newline at end of file diff --git a/.pylintrc b/.pylintrc new file mode 100644 index 0000000..731e424 --- /dev/null +++ b/.pylintrc @@ -0,0 +1,10 @@ +[MASTER] +jobs=0 +load-plugins=pylint.extensions.docparams + +[MESSAGES CONTROL] +disable=all +enable=missing-param-doc, + differing-param-doc, + differing-type-doc, + missing-return-doc diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..b5a6b56 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,54 @@ +language: python +sudo: true +dist: xenial +git: + depth: 9999999 +services: + - docker +cache: pip +before_cache: + - chown -R travis:travis $HOME/.cache/pip +stages: + - style + - test +_install: &_install + - travis_retry make bblfsh-start + - pip install --upgrade pip cython codecov + - ML_CORE_SETUP_INCLUDE_TESTS=1 pip install .[tf] + - cd $(pip show sourced.ml.core|grep Location|cut -d' ' -f2)/sourced/ml/core + - find . 
-wholename "*/tests/*" -type d -exec chmod 555 {} \; +_coverage: &_coverage + - coverage run --concurrency=multiprocessing -m unittest discover + - travis_retry coverage combine +matrix: + fast_finish: true + include: + - stage: style + python: 3.7 + script: + - make check + install: + - pip install -r requirements-lint.txt + - stage: test + python: 3.5 + script: *_coverage + install: *_install + - stage: test + python: 3.6 + script: *_coverage + install: *_install + - stage: test + python: 3.7 + script: *_coverage + install: *_install + after_success: + - codecov + - stage: test + name: Tests inside docker + script: + - make docker-build VERSION=test + - make docker-test VERSION=test + install: + - travis_retry make bblfsh-start +notifications: + email: false \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..26b9ba1 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,30 @@ +FROM ubuntu:18.04 + +ENV BROWSER=/browser \ + LC_ALL=en_US.UTF-8 + +COPY requirements.txt ml_core/requirements.txt + +RUN apt-get update && \ + apt-get install -y --no-install-suggests --no-install-recommends \ + ca-certificates locales libxml2 libxml2-dev gcc g++ wget \ + python3 python3-dev python3-distutils && \ + echo "en_US.UTF-8 UTF-8" > /etc/locale.gen && \ + locale-gen && \ + wget -O - https://bootstrap.pypa.io/get-pip.py | python3 && \ + cd ml_core && \ + pip3 install --no-cache-dir -r requirements.txt && \ + apt-get remove -y python3-dev libxml2-dev gcc g++ wget && \ + apt-get remove -y .*-doc .*-man >/dev/null && \ + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* && \ + echo '#!/bin/bash\n\ +\n\ +echo\n\ +echo " $@"\n\ +echo\n\' > /browser && \ + chmod +x /browser + +COPY . ml_core/ +RUN cd ml_core && pip3 install -e . diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..ab7b419 --- /dev/null +++ b/Makefile @@ -0,0 +1,38 @@ +current_dir = $(shell pwd) + +PROJECT = ml_core + +DOCKERFILES = Dockerfile:$(PROJECT) +DOCKER_ORG = "srcd" + +# Including ci Makefile +CI_REPOSITORY ?= https://github.com/src-d/ci.git +CI_BRANCH ?= v1 +CI_PATH ?= .ci +MAKEFILE := $(CI_PATH)/Makefile.main +$(MAKEFILE): + git clone --quiet --depth 1 -b $(CI_BRANCH) $(CI_REPOSITORY) $(CI_PATH); +-include $(MAKEFILE) + +.PHONY: check +check: + ! (grep -R /tmp sourced/ml/core/tests) + flake8 --count + pylint sourced + +.PHONY: test +test: + python3 -m unittest discover + +.PHONY: docker-test +docker-test: + docker ps | grep bblfshd # bblfsh server should be run. Try `make bblfsh-start` command. + docker run --rm -it --network host --entrypoint python3 -w /ml_core \ + -e SKIP_BBLFSH_UTILS_TESTS=1 \ + srcd/ml_core:$(VERSION) -m unittest discover + +.PHONY: bblfsh-start +bblfsh-start: + ! 
docker ps | grep bblfshd # bblfsh server should not be running already + docker run -d --name ml_core_bblfshd --privileged -p 9432\:9432 bblfsh/bblfshd\:v2.12.1 + docker exec -it ml_core_bblfshd bblfshctl driver install python bblfsh/python-driver\:v2.9.0 diff --git a/README.md b/README.md index 94e49a5..a924da2 100644 --- a/README.md +++ b/README.md @@ -1,95 +1,7 @@ -# MLonCode research playground [![PyPI](https://img.shields.io/pypi/v/sourced-ml.svg)](https://pypi.python.org/pypi/sourced-ml) [![Build Status](https://travis-ci.org/src-d/ml.svg)](https://travis-ci.org/src-d/ml) [![Docker Build Status](https://img.shields.io/docker/build/srcd/ml.svg)](https://hub.docker.com/r/srcd/ml) [![codecov](https://codecov.io/github/src-d/ml/coverage.svg)](https://codecov.io/gh/src-d/ml) +# MLonCode Core Library + [![Build Status](https://travis-ci.org/src-d/ml-core.svg)](https://travis-ci.org/src-d/ml-core) + [![codecov](https://codecov.io/github/src-d/ml-core/coverage.svg)](https://codecov.io/gh/src-d/ml-core) + [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black) -This project is the foundation for [MLonCode](https://github.com/src-d/awesome-machine-learning-on-source-code) research and development. It abstracts feature extraction and training models, thus allowing to focus on the higher level tasks. - -Currently, the following models are implemented: - -* BOW - weighted bag of x, where x is many different extracted feature types. -* id2vec, source code identifier embeddings. -* docfreq, feature document frequencies \(part of TF-IDF\). -* topic modeling over source code identifiers. - -It is written in Python3 and has been tested on Linux and macOS. source{d} core-ml is tightly -coupled with [source{d} engine](https://engine.sourced.tech) and delegates all the feature extraction parallelization to it. - -Here is the list of proof-of-concept projects which are built using ml-core: - -* [vecino](https://github.com/src-d/vecino) - finding similar repositories. -* [tmsc](https://github.com/src-d/tmsc) - listing topics of a repository. -* [snippet-ranger](https://github.com/src-d/snippet-ranger) - topic modeling of source code snippets. -* [apollo](https://github.com/src-d/apollo) - source code deduplication at scale. - -## Installation - -Whether you wish to include Spark in your installation or would rather use an existing -installation, to use `sourced-ml` you will need to have some native libraries installed, -e.g. on Ubuntu you must first run: `apt install libxml2-dev libsnappy-dev`. [Tensorflow](https://tensorflow.org) -is also a requirement - we support both the CPU and GPU version. -In order to select which version you want, modify the package name in the next section -to either `sourced-ml[tf]` or `sourced-ml[tf-gpu]` depending on your choice. -**If you don't, neither version will be installed.** - -## Docker image - -```text -docker run -it --rm srcd/ml --help -``` - -If this first command fails with - -```text -Cannot connect to the Docker daemon. Is the docker daemon running on this host? -``` - -And you are sure that the daemon is running, then you need to add your user to `docker` group: refer to the [documentation](https://docs.docker.com/engine/installation/linux/linux-postinstall/#manage-docker-as-a-non-root-user). - -## Contributions - -...are welcome! See [CONTRIBUTING](contributing.md) and [CODE\_OF\_CONDUCT.md](code_of_conduct.md). 
- -## License - -[Apache 2.0](license.md) - -## Algorithms - -#### Identifier embeddings - -We build the source code identifier co-occurrence matrix for every repository. - -1. Read Git repositories. -2. Classify files using [enry](https://github.com/src-d/enry). -3. Extract [UAST](https://doc.bblf.sh/uast/specification.html) from each supported file. -4. [Split and stem](https://github.com/src-d/ml/tree/d1f13d079f57caa6338bb7eb8acb9062e011eda9/sourced/ml/algorithms/token_parser.py) all the identifiers in each tree. -5. [Traverse UAST](https://github.com/src-d/ml/tree/d1f13d079f57caa6338bb7eb8acb9062e011eda9/sourced/ml/transformers/coocc.py), collapse all non-identifier paths and record all - identifiers on the same level as co-occurring. Besides, connect them with their immediate parents. - -6. Write the global co-occurrence matrix. -7. Train the embeddings using [Swivel](https://github.com/src-d/ml/tree/d1f13d079f57caa6338bb7eb8acb9062e011eda9/sourced/ml/algorithms/swivel.py) \(requires Tensorflow\). Interactively view - the intermediate results in Tensorboard using `--logs`. - -8. Write the identifier embeddings model. - -1-5 is performed with `repos2coocc` command, 6 with `id2vec_preproc`, 7 with `id2vec_train`, 8 with `id2vec_postproc`. - -#### Weighted Bag of X - -We represent every repository as a weighted bag-of-vectors, provided by we've got document frequencies \("docfreq"\) and identifier embeddings \("id2vec"\). - -1. Clone or read the repository from disk. -2. Classify files using [enry](https://github.com/src-d/enry). -3. Extract [UAST](https://doc.bblf.sh/uast/specification.html) from each supported file. -4. Extract various features from each tree, e.g. identifiers, literals or node2vec-like structural fingerprints. -5. Group by repository, file or function. -6. Set the weight of each such feature according to TF-IDF. -7. Write the BOW model. - -1-7 are performed with `repos2bow` command. - -#### Topic modeling - -See [here](doc/topic_modeling.md). - -## Glossary - -See [here](GLOSSARY.md). +Library for machine learning on source code. Provides commonly used algorithms and tools + to process the code-related data, such as: Babelfish's UASTs, plain code text, etc. \ No newline at end of file diff --git a/SUMMARY.md b/SUMMARY.md deleted file mode 100644 index 9824e7f..0000000 --- a/SUMMARY.md +++ /dev/null @@ -1,16 +0,0 @@ -# Table of contents - -* [README](README.md) -* [doc](doc/README.md) - * [neural\_splitter\_arch](doc/neural_splitter_arch.md) - * [topic\_modeling](doc/topic_modeling.md) - * [cmd](doc/cmd/README.md) - * [Preprocrepos command](doc/cmd/preprocrepos.md) - * [README](doc/proposals/README.md) - * [MLIP-000](doc/proposals/mlip-000.md) - * [spark](doc/spark.md) -* [LICENSE](license.md) -* [MAINTAINERS](maintainers.md) -* [CODE\_OF\_CONDUCT](code_of_conduct.md) -* [CONTRIBUTING](contributing.md) - diff --git a/contributing.md b/contributing.md index 1950215..458c5b9 100644 --- a/contributing.md +++ b/contributing.md @@ -1,6 +1,7 @@ # CONTRIBUTING -sourced.ml project is [Apache licensed](license.md) and accepts contributions via GitHub pull requests. This document outlines some of the conventions on development workflow, commit message formatting, contact points, and other resources to make it easier to get your contribution accepted. +ml-core project is [Apache licensed](license.md) and accepts contributions via GitHub pull +requests. 
This document outlines some of the conventions on development workflow, commit message formatting, contact points, and other resources to make it easier to get your contribution accepted. ## Certificate of Origin diff --git a/maintainers.md b/maintainers.md index 2404e99..ca60267 100644 --- a/maintainers.md +++ b/maintainers.md @@ -2,4 +2,4 @@ Vadim Markovtsev [vadim@sourced.tech](mailto:vadim@sourced.tech) \(@vmarkovtsev\) -Guillem Duran [guillem@sourced.tech](mailto:vadim@sourced.tech) \(@guillemdb) \ No newline at end of file +Guillem Duran [guillem@sourced.tech](mailto:vadim@sourced.tech) \(@guillemdb\) \ No newline at end of file diff --git a/requirements-lint.txt b/requirements-lint.txt new file mode 100644 index 0000000..e89a272 --- /dev/null +++ b/requirements-lint.txt @@ -0,0 +1,7 @@ +flake8==3.5.0 +flake8-bugbear==18.8.0 +flake8-docstrings==1.3.0 +flake8-import-order==0.18.1 +flake8-quotes==1.0.0 +flake8-per-file-ignores==0.8.1 +pylint==2.3.1 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..7cf3b48 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +Cython>=0.28,<1.0; python_version == '3.7' +PyStemmer==1.3.0 +bblfsh==2.12.7 +modelforge==0.12.1 +numpy==1.16.2 +humanize==0.5.1 +pygments==2.3.1 +keras==2.2.4 +scikit-learn==0.20.3 +tqdm==4.31.1 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..e6b0aab --- /dev/null +++ b/setup.py @@ -0,0 +1,73 @@ +from importlib.machinery import SourceFileLoader +import io +import os.path + +from setuptools import find_packages, setup + +sourcedml = SourceFileLoader("sourced-ml-core", "./sourced/ml/core/__init__.py").load_module() + +with io.open(os.path.join(os.path.dirname(__file__), "README.md"), encoding="utf-8") as f: + long_description = f.read() + +tf_requires = ["tensorflow>=1.0,<2.0"] +tf_gpu_requires = ["tensorflow-gpu>=1.0,<2.0"] +exclude_packages = ( + ("sourced.ml.core.tests", "sourced.ml.core.tests.source") + if not os.getenv("ML_CORE_SETUP_INCLUDE_TESTS", False) + else () +) + + +setup( + name="sourced-ml-core", + description="Framework for machine learning on source code. 
" + "Provides API and tools to train and use models based " + "on source code features extracted from Babelfish's UASTs.", + long_description=long_description, + long_description_content_type="text/markdown", + version=sourcedml.__version__, + license="Apache 2.0", + author="source{d}", + author_email="machine-learning@sourced.tech", + url="https://github.com/src-d/ml-core", + download_url="https://github.com/src-d/ml-core", + packages=find_packages(exclude=exclude_packages), + keywords=[ + "machine learning on source code", + "word2vec", + "id2vec", + "github", + "swivel", + "bow", + "bblfsh", + "babelfish", + ], + install_requires=[ + "PyStemmer>=1.3,<2.0", + "bblfsh>=2.2.1,<3.0", + "modelforge>=0.12.1,<0.13", + "humanize>=0.5.0,<0.6", + "pygments>=2.2.0,<3.0", + "keras>=2.0,<3.0", + "scikit-learn>=0.19,<1.0", + "tqdm>=4.20,<5.0", + ], + extras_require={"tf": tf_requires, "tf_gpu": tf_gpu_requires}, + tests_require=["docker>=3.6.0,<4.0"], + package_data={ + "": ["LICENSE.md", "README.md"], + "sourced.ml.core.tests": ["./asdf/*.asdf", "./swivel/*", "identifiers.csv.tar.gz"], + }, + python_requires=">=3.5", + classifiers=[ + "Development Status :: 3 - Alpha", + "Environment :: Console", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Operating System :: POSIX", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Topic :: Software Development :: Libraries", + ], +) diff --git a/sourced/__init__.py b/sourced/__init__.py new file mode 100644 index 0000000..9b44dd0 --- /dev/null +++ b/sourced/__init__.py @@ -0,0 +1 @@ +"""MLonCode Core Library.""" diff --git a/sourced/ml/__init__.py b/sourced/ml/__init__.py new file mode 100644 index 0000000..9b44dd0 --- /dev/null +++ b/sourced/ml/__init__.py @@ -0,0 +1 @@ +"""MLonCode Core Library.""" diff --git a/sourced/ml/core/__init__.py b/sourced/ml/core/__init__.py new file mode 100644 index 0000000..588ea1a --- /dev/null +++ b/sourced/ml/core/__init__.py @@ -0,0 +1,9 @@ +"""MLonCode Core Library.""" +try: + import modelforge.configuration + + modelforge.configuration.refresh() +except ImportError: + pass + +__version__ = "0.0.1" diff --git a/sourced/ml/core/algorithms/__init__.py b/sourced/ml/core/algorithms/__init__.py new file mode 100644 index 0000000..992aff6 --- /dev/null +++ b/sourced/ml/core/algorithms/__init__.py @@ -0,0 +1,10 @@ +# flake8: noqa +from sourced.ml.core.algorithms.tf_idf import log_tf_log_idf +from sourced.ml.core.algorithms.token_parser import TokenParser, NoopTokenParser +from sourced.ml.core.algorithms.uast.ids_to_bag import UastIds2Bag, uast2sequence +from sourced.ml.core.algorithms.uast.struct_to_bag import UastRandomWalk2Bag, UastSeq2Bag +from sourced.ml.core.algorithms.uast.inttypes_to_nodes import Uast2QuantizedChildren +from sourced.ml.core.algorithms.uast.inttypes_to_graphlets import Uast2GraphletBag +from sourced.ml.core.algorithms.uast.to_role_id_pairs import Uast2RoleIdPairs +from sourced.ml.core.algorithms.uast.id_distance import Uast2IdLineDistance, Uast2IdTreeDistance +from sourced.ml.core.algorithms.uast.to_id_sequence import Uast2IdSequence diff --git a/sourced/ml/core/algorithms/id_embedding.py b/sourced/ml/core/algorithms/id_embedding.py new file mode 100644 index 0000000..2a8ac40 --- /dev/null +++ b/sourced/ml/core/algorithms/id_embedding.py @@ -0,0 +1,42 @@ +import numpy + + +def extract_coocc_matrix(global_shape, word_indices, model): + # Stage 1 - extract the tokens, map them to 
the global vocabulary
+    indices = []
+    mapped_indices = []
+    for i, w in enumerate(model.tokens):
+        gi = word_indices.get(w)
+        if gi is not None:
+            indices.append(i)
+            mapped_indices.append(gi)
+    indices = numpy.array(indices)
+    mapped_indices = numpy.array(mapped_indices)
+    # Stage 2 - sort the matched tokens by the index in the vocabulary
+    order = numpy.argsort(mapped_indices)
+    indices = indices[order]
+    mapped_indices = mapped_indices[order]
+    # Stage 3 - produce the csr_matrix with the matched tokens **only**
+    matrix = model.matrix.tocsr()[indices][:, indices]
+    # Stage 4 - convert this matrix to the global (ccmatrix) coordinates
+    csr_indices = matrix.indices
+    for i, v in enumerate(csr_indices):
+        # Here we use the fact that indices and mapped_indices are in the same order
+        csr_indices[i] = mapped_indices[v]
+    csr_indptr = matrix.indptr
+    new_indptr = [0]
+    for i, v in enumerate(mapped_indices):
+        prev_ptr = csr_indptr[i]
+        ptr = csr_indptr[i + 1]
+
+        # Handle missing rows
+        prev = (mapped_indices[i - 1] + 1) if i > 0 else 0
+        for _ in range(prev, v):
+            new_indptr.append(prev_ptr)
+
+        new_indptr.append(ptr)
+    for _ in range(mapped_indices[-1] + 1, global_shape[0]):
+        new_indptr.append(csr_indptr[-1])
+    matrix.indptr = numpy.array(new_indptr)
+    matrix._shape = global_shape
+    return matrix
diff --git a/sourced/ml/core/algorithms/id_splitter/README.md b/sourced/ml/core/algorithms/id_splitter/README.md
new file mode 100644
index 0000000..695225b
--- /dev/null
+++ b/sourced/ml/core/algorithms/id_splitter/README.md
@@ -0,0 +1,128 @@
+# Neural Identifier Splitter
+Based on the article [Splitting source code identifiers using Bidirectional LSTM Recurrent Neural Network](https://arxiv.org/abs/1805.11651).
+
+### Agenda
+* Data
+* Training pipeline
+* How to launch
+
+### Data
+You can download the dataset [here](https://drive.google.com/open?id=1wZR5zF1GL1fVcA1gZuAN_9rSLd5ssqKV). More information about the dataset is available [here](https://github.com/src-d/datasets/tree/master/Identifiers).
+#### Data format
+* file format: `.csv.gz`
+* the CSV structure:
+
+|num_files|num_occ|num_repos|token|token_split|
+|:--|:--|:--|:--|:--|
+|1|2|1|quesesSet|queses set|
+|...|...|...|...|...|
+
+#### Data stats
+* 49 million identifiers
+* 1 GB
+
+### Training pipeline
+The training pipeline consists of the following steps:
+* [prepare features](https://github.com/src-d/ml/blob/master/sourced/ml/algorithms/id_splitter/features.py#L44-#L118) - read the data, extract features, do the train/test split
+* [prepare generators for keras](https://github.com/src-d/ml/blob/master/sourced/ml/cmd/train_id_split.py#L34-#L48)
+* [prepare the model - RNN or CNN](https://github.com/src-d/ml/blob/master/sourced/ml/cmd/train_id_split.py#L53-#L76)
+* [training](https://github.com/src-d/ml/blob/master/sourced/ml/cmd/train_id_split.py#L78-#L89)
+* [quality report and save the model](https://github.com/src-d/ml/blob/master/sourced/ml/cmd/train_id_split.py#L91-#L96)
+
+### How to launch
+First of all, you need to download the dataset using the link above.
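As a quick sanity check of the downloaded file, you can peek at the first few rows. The sketch below assumes a plain gzip-compressed CSV with the columns listed above (`num_files,num_occ,num_repos,token,token_split`) and a hypothetical file name; note that `read_identifiers` in `features.py` actually reads the dataset through `tarfile`, so adapt the opening step to however the archive is packaged.

```python
import csv
import gzip

# Print the first few identifiers together with their ground-truth splits.
with gzip.open("identifiers.csv.gz", "rt", encoding="utf-8") as stream:
    reader = csv.reader(stream)
    next(reader)  # skip the header: num_files,num_occ,num_repos,token,token_split
    for i, row in enumerate(reader):
        print(row[3], "->", row[4])  # raw identifier and its split form
        if i == 4:
            break
```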
+ +Usage: +```console +usage: srcml train-id-split [-h] -i INPUT [-e EPOCHS] [-b BATCH_SIZE] + [-l LENGTH] -o OUTPUT [-t TEST_RATIO] + [-p {pre,post}] [--optimizer {RMSprop,Adam}] + [--lr LR] [--final-lr FINAL_LR] + [--samples-before-report SAMPLES_BEFORE_REPORT] + [--val-batch-size VAL_BATCH_SIZE] [--seed SEED] + [--devices DEVICES] + [--csv-identifier CSV_IDENTIFIER] + [--csv-identifier-split CSV_IDENTIFIER_SPLIT] + [--include-csv-header] --model {RNN,CNN} + [-s STACK] + [--type-cell {GRU,LSTM,CuDNNLSTM,CuDNNGRU}] + [-n NEURONS] [-f FILTERS] [-k KERNEL_SIZES] + [--dim-reduction DIM_REDUCTION] + +optional arguments: + -h, --help show this help message and exit + -i INPUT, --input INPUT + Path to the input data in CSV + format:num_files,num_occ,num_repos,token,token_split + -e EPOCHS, --epochs EPOCHS + Number of training epochs. The more the betterbut the + training time is proportional. (default: 10) + -b BATCH_SIZE, --batch-size BATCH_SIZE + Batch size. Higher values better utilize GPUsbut may + harm the convergence. (default: 500) + -l LENGTH, --length LENGTH + RNN sequence length. (default: 40) + -o OUTPUT, --output OUTPUT + Path to store the trained model. + -t TEST_RATIO, --test-ratio TEST_RATIO + Fraction of the dataset to use for evaluation. + (default: 0.2) + -p {pre,post}, --padding {pre,post} + Whether to pad before or after each sequence. + (default: post) + --optimizer {RMSprop,Adam} + Algorithm to use as an optimizer for the neural net. + (default: Adam) + --lr LR Initial learning rate. (default: 0.001) + --final-lr FINAL_LR Final learning rate. The decrease from the initial + learning rate is done linearly. (default: 1e-05) + --samples-before-report SAMPLES_BEFORE_REPORT + Number of samples between each validation reportand + training updates. (default: 5000000) + --val-batch-size VAL_BATCH_SIZE + Batch size for validation.It can be increased to speed + up the pipeline butit proportionally increases the + memory consumption. (default: 2000) + --seed SEED Random seed. (default: 1989) + --devices DEVICES Device(s) to use. '-1' means CPU. (default: 0) + --csv-identifier CSV_IDENTIFIER + Column name in the CSV file for the raw identifier. + (default: 3) + --csv-identifier-split CSV_IDENTIFIER_SPLIT + Column name in the CSV file for the splitidentifier. + (default: 4) + --include-csv-header Treat the first line of the input CSV as a + regularline. (default: False) + --model {RNN,CNN} Neural Network model to use to learn the + identifiersplitting task. + -s STACK, --stack STACK + Number of layers stacked on each other. (default: 2) + --type-cell {GRU,LSTM,CuDNNLSTM,CuDNNGRU} + Recurrent layer type to use. (default: LSTM) + -n NEURONS, --neurons NEURONS + Number of neurons on each layer. (default: 256) + -f FILTERS, --filters FILTERS + Number of filters for each kernel size. (default: + 64,32,16,8) + -k KERNEL_SIZES, --kernel-sizes KERNEL_SIZES + Sizes for sliding windows. (default: 2,4,8,16) + --dim-reduction DIM_REDUCTION + Number of 1-d kernels to reduce dimensionalityafter + each layer. 
(default: 32) +``` + + +Examples of commands: +1) Train RNN with LSTM cells +```console +srcml train-id-split --model RNN --input /path/to/input.csv.gz --output /path/to/output +``` +2) Train RNN with CuDNNLSTM cells +```console +srcml train-id-split --model RNN --input /path/to/input.csv.gz --output /path/to/output \ +--type-cell CuDNNLSTM +``` +3) Train CNN +```console +srcml train-id-split --model CNN --input /path/to/input.csv.gz --output /path/to/output +``` diff --git a/sourced/ml/core/algorithms/id_splitter/__init__.py b/sourced/ml/core/algorithms/id_splitter/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sourced/ml/core/algorithms/id_splitter/features.py b/sourced/ml/core/algorithms/id_splitter/features.py new file mode 100644 index 0000000..74e91d7 --- /dev/null +++ b/sourced/ml/core/algorithms/id_splitter/features.py @@ -0,0 +1,143 @@ +import logging +import string +import tarfile +from typing import List, Tuple + +from modelforge.progress_bar import progress_bar +import numpy + + +def read_identifiers( + csv_path: str, + use_header: bool, + max_identifier_len: int, + identifier_col: int, + split_identifier_col: int, + shuffle: bool = True, +) -> List[str]: + """ + Reads and filters too long identifiers in the CSV file. + + :param csv_path: path to the CSV file. + :param use_header: uses header as normal line (True) or treat as header line with column names. + :param max_identifier_len: maximum length of raw identifiers. Skip identifiers that are longer. + :param identifier_col: column name in the CSV file for the raw identifier. + :param split_identifier_col: column name in the CSV file for the split identifier lowercase. + :param shuffle: indicates whether to reorder the list of identifiers + at random after reading it. + :return: list of split identifiers. + """ + log = logging.getLogger("read_identifiers") + log.info("Reading data from the CSV file %s", csv_path) + identifiers = [] + # TODO: Update dataset loading as soon as https://github.com/src-d/backlog/issues/1212 done + # Think about dataset download step + with tarfile.open(csv_path, encoding="utf-8") as f: + assert len(f.members) == 1, "One archived file is expected, got: %s" % len(f.members) + content = f.extractfile(f.members[0]) + if not use_header: + content.readline() + for line in progress_bar(content.readlines(), log): + row = line.decode("utf-8").strip().split(",") + if len(row[identifier_col]) <= max_identifier_len: + identifiers.append(row[split_identifier_col]) + if shuffle: + numpy.random.shuffle(identifiers) + log.info("Number of identifiers after filtering: %s." % len(identifiers)) + return identifiers + + +def prepare_features( + csv_path: str, + use_header: bool, + max_identifier_len: int, + identifier_col: int, + split_identifier_col: int, + test_ratio: float, + padding: str, + shuffle: bool = True, +) -> Tuple[numpy.ndarray]: + """ + Prepare the features to train the identifier splitting task. + + :param csv_path: path to the CSV file. + :param use_header: uses header as normal line (True) or treat as header line with column names. + :param max_identifier_len: maximum length of raw identifiers. Skip identifiers that are longer. + :param identifier_col: column in the CSV file for the raw identifier. + :param split_identifier_col: column in the CSV file for the split identifier. + :param shuffle: indicates whether to reorder the list of identifiers + at random after reading it. + :param test_ratio: Proportion of test samples used for evaluation. 
+ :param padding: position where to add padding values: + after the intput sequence if "post", before if "pre". + :return: training and testing features to train the neural net for the splitting task. + """ + from keras.preprocessing.sequence import pad_sequences + + log = logging.getLogger("prepare_features") + + # read data from the input file + identifiers = read_identifiers( + csv_path=csv_path, + use_header=use_header, + max_identifier_len=max_identifier_len, + identifier_col=identifier_col, + split_identifier_col=split_identifier_col, + shuffle=shuffle, + ) + + log.info("Converting identifiers to character indices") + log.info( + "Number of identifiers: %d, Average length: %d characters" + % (len(identifiers), numpy.mean([len(i) for i in identifiers])) + ) + + char2ind = {c: i + 1 for i, c in enumerate(sorted(string.ascii_lowercase))} + + char_id_seq = [] + splits = [] + for identifier in identifiers: + # iterate through the identifier and convert to array of char indices & boolean split array + index_arr = [] + split_arr = [] + skip_char = False + for char in identifier.strip(): + if char in char2ind: + index_arr.append(char2ind[char]) + if skip_char: + skip_char = False + continue + split_arr.append(0) + elif char == " ": + split_arr.append(1) + skip_char = True + else: + log.warning("Unexpected symbol %s in identifier", char) + assert len(index_arr) == len(split_arr) + char_id_seq.append(index_arr) + splits.append(split_arr) + + log.info( + "Number of subtokens: %d, Number of distinct characters: %d" + % ( + sum(sum(split_arr) for split_arr in splits) + len(identifiers), + len({i for index_arr in char_id_seq for i in index_arr}), + ) + ) + + log.info("Train/test splitting...") + n_train = int((1 - test_ratio) * len(char_id_seq)) + X_train = char_id_seq[:n_train] + X_test = char_id_seq[n_train:] + y_train = splits[:n_train] + y_test = splits[n_train:] + log.info( + "Number of train samples: %s, number of test samples: %s" % (len(X_train), len(X_test)) + ) + log.info("Padding the sequences...") + X_train = pad_sequences(X_train, maxlen=max_identifier_len, padding=padding) + X_test = pad_sequences(X_test, maxlen=max_identifier_len, padding=padding) + y_train = pad_sequences(y_train, maxlen=max_identifier_len, padding=padding) + y_test = pad_sequences(y_test, maxlen=max_identifier_len, padding=padding) + + return X_train, X_test, y_train[:, :, None], y_test[:, :, None] diff --git a/sourced/ml/core/algorithms/id_splitter/nn_model.py b/sourced/ml/core/algorithms/id_splitter/nn_model.py new file mode 100644 index 0000000..6fdf3ba --- /dev/null +++ b/sourced/ml/core/algorithms/id_splitter/nn_model.py @@ -0,0 +1,275 @@ +import string +from typing import Callable, List, Tuple, Union +import warnings + +import keras +from keras import backend as kbackend +from keras.layers import ( + BatchNormalization, + Concatenate, + Conv1D, + Dense, + Embedding, + Input, + TimeDistributed, +) +from keras.models import Model +import numpy + +try: + import tensorflow as tf +except ImportError: + warnings.warn("Tensorflow is not installed, dependent functionality is unavailable.") + + +LOSS = "binary_crossentropy" +METRICS = ["accuracy"] +# Number of unique characters and dimension of the embedding layer +NUM_CHARS = len(string.ascii_lowercase) + + +def register_metric(metric: Union[str, Callable]) -> Union[str, Callable]: + """ + Decorator function to register the metrics in the METRICS constant. + + :param metric: name of the tensorflow metric or custom function metric. + :return: the metric. 
+ """ + assert isinstance(metric, str) or callable(metric) + METRICS.append(metric) + return metric + + +def prepare_devices(devices: str) -> Tuple[str]: + """ + Extract devices from arguments. + + :param devices: devices to use passed as one string argument. + :return: split devices. + """ + devices = devices.split(",") + if len(devices) == 2: + dev0, dev1 = ("/gpu:" + dev for dev in devices) + elif len(devices) == 1: + if int(devices[0]) != -1: + dev0 = dev1 = "/gpu:" + devices[0] + else: + dev0 = dev1 = "/cpu:0" + else: + raise ValueError( + "Expected 1 or 2 devices but got %d from the devices argument %s" + % (len(devices), devices) + ) + return dev0, dev1 + + +def prepare_input_emb(maxlen: int) -> Tuple[tf.Tensor]: + """ + Builds character embeddings, a dense representation of characters to feed the RNN with. + + :param maxlen: maximum length of the input sequence. + :return: input and one-hot character embedding layer. + """ + char_seq = Input((maxlen,)) + emb = Embedding( + input_dim=NUM_CHARS + 1, + output_dim=NUM_CHARS + 1, + input_length=maxlen, + mask_zero=False, + weights=[numpy.eye(NUM_CHARS + 1)], + trainable=False, + )(char_seq) + return char_seq, emb + + +def add_output_layer(hidden_layer: tf.Tensor) -> keras.layers.wrappers.TimeDistributed: + """ + Applies a Dense layer to each of the timestamps of a hidden layer, independently. + The output layer has 1 sigmoid per character which predicts if there is a space or not + before the character. + + :param hidden_layer: hidden layer before the output layer. + :return: output layer. + """ + norm_input = BatchNormalization()(hidden_layer) + return TimeDistributed(Dense(1, activation="sigmoid"))(norm_input) + + +def add_rnn( + X: tf.Tensor, units: int, rnn_layer: str, dev0: str = "/gpu:0", dev1: str = "/gpu:1" +) -> tf.Tensor: + """ + Adds a bidirectional RNN layer with the specified parameters. + + :param X: input layer. + :param units: number of neurons in the output layer. + :param rnn_layer: type of cell in the RNN. + :param dev0: device that will be used as forward pass of RNN and concatenation. + :param dev1: device that will be used as backward pass. + :return: output bidirectional RNN layer. + """ + # select the type of RNN layer + rnn_layer = getattr(keras.layers, rnn_layer) + + # add the forward & backward RNN + with tf.device(dev0): + forward = rnn_layer(units=units, return_sequences=True)(X) + with tf.device(dev1): + backward = rnn_layer(units=units, return_sequences=True, go_backwards=True)(X) + + # concatenate + with tf.device(dev1): + bidi = Concatenate(axis=-1)([forward, backward]) + return bidi + + +def build_rnn( + maxlen: int, units: int, stack: int, optimizer: str, dev0: str, dev1: str, rnn_layer: str +) -> keras.engine.training.Model: + """ + Builds a RNN model with the parameters specified as arguments. + + :param maxlen: maximum length of the input sequence. + :param units: number of neurons or dimensionality of the output RNN. + :param stack: number of RNN layers to stack. + :param optimizer: algorithm to use as an optimizer for the RNN. + :param rnn_layer: recurrent layer type to use. + :param dev0: first device to use when running specific operations. + :param dev1: second device to use when running specific operations. + :return: compiled RNN model. 
+ """ + # prepare the model + with tf.device(dev0): + char_seq, hidden_layer = prepare_input_emb(maxlen) + + # stack the BiDi-RNN layers + for _ in range(stack): + hidden_layer = add_rnn( + hidden_layer, units=units, rnn_layer=rnn_layer, dev0=dev0, dev1=dev1 + ) + output = add_output_layer(hidden_layer) + + # compile the model + model = Model(inputs=char_seq, outputs=output) + model.compile(optimizer=optimizer, loss=LOSS, metrics=METRICS) + return model + + +def add_conv( + X: tf.Tensor, filters: List[int], kernel_sizes: List[int], output_n_filters: int +) -> tf.Tensor: + """ + Builds a single convolutional layer. + + :param X: input layer. + :param filters: number of output filters in the convolution. + :param kernel_sizes: list of lengths of the 1D convolution window. + :param output_n_filters: number of 1D output filters. + :return: output layer. + """ + # normalize the input + X = BatchNormalization()(X) + + # add convolutions + convs = [] + + for n_filters, kernel_size in zip(filters, kernel_sizes): + conv = Conv1D( + filters=n_filters, kernel_size=kernel_size, padding="same", activation="relu" + ) + convs.append(conv(X)) + + # concatenate all convolutions + conc = Concatenate(axis=-1)(convs) + conc = BatchNormalization()(conc) + + # dimensionality reduction + conv = Conv1D(filters=output_n_filters, kernel_size=1, padding="same", activation="relu") + return conv(conc) + + +def build_cnn( + maxlen: int, + filters: List[int], + output_n_filters: int, + stack: int, + kernel_sizes: List[int], + optimizer: str, + device: str, +) -> keras.engine.training.Model: + """ + Builds a CNN model with the parameters specified as arguments. + + :param maxlen: maximum length of the input sequence. + :param filters: number of output filters in the convolution. + :param output_n_filters: number of 1d output filters. + :param stack: number of CNN layers to stack. + :param kernel_sizes: list of lengths of the 1D convolution window. + :param optimizer: algorithm to use as an optimizer for the CNN. + :param device: device to use when running specific operations. + :return: compiled CNN model. + """ + # prepare the model + with tf.device(device): + char_seq, hidden_layer = prepare_input_emb(maxlen) + + # stack the CNN layers + for _ in range(stack): + hidden_layer = add_conv( + hidden_layer, + filters=filters, + kernel_sizes=kernel_sizes, + output_n_filters=output_n_filters, + ) + output = add_output_layer(hidden_layer) + + # compile the model + model = Model(inputs=char_seq, outputs=output) + model.compile(optimizer=optimizer, loss=LOSS, metrics=METRICS) + return model + + +@register_metric +def precision(y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor: + """ + Computes the precision, a metric for multi-label classification of + how many selected items are relevant. + + :param y_true: tensor of true labels. + :param y_pred: tensor of predicted labels. + :return: a tensor batch-wise average of precision. + """ + true_positives = kbackend.sum(kbackend.round(kbackend.clip(y_true * y_pred, 0, 1))) + predicted_positives = kbackend.sum(kbackend.round(kbackend.clip(y_pred, 0, 1))) + precision = true_positives / (predicted_positives + kbackend.epsilon()) + return precision + + +@register_metric +def recall(y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor: + """ + Computes the recall, a metric for multi-label classification of + how many relevant items are selected. + + :param y_true: tensor of true labels. + :param y_pred: tensor of predicted labels. + :return: a tensor batch-wise average of recall. 
+ """ + true_positives = kbackend.sum(kbackend.round(kbackend.clip(y_true * y_pred, 0, 1))) + possible_positives = kbackend.sum(kbackend.round(kbackend.clip(y_true, 0, 1))) + recall = true_positives / (possible_positives + kbackend.epsilon()) + return recall + + +@register_metric +def f1score(y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor: + """ + Computes the F1 score, the harmonic average of precision and recall. + + :param y_true: tensor of true labels. + :param y_pred: tensor of predicted labels. + :return: a tensor batch-wise average of F1 score. + """ + prec = precision(y_true, y_pred) + rec = recall(y_true, y_pred) + return 2 * prec * rec / (prec + rec + kbackend.epsilon()) diff --git a/sourced/ml/core/algorithms/id_splitter/pipeline.py b/sourced/ml/core/algorithms/id_splitter/pipeline.py new file mode 100644 index 0000000..d4fe564 --- /dev/null +++ b/sourced/ml/core/algorithms/id_splitter/pipeline.py @@ -0,0 +1,236 @@ +from datetime import datetime +import logging +import os +import random +from typing import Callable, Iterable, List, Tuple +import warnings + +import keras +from keras import backend as kbackend +from keras.callbacks import CSVLogger, LearningRateScheduler, ModelCheckpoint, TensorBoard +import numpy + +try: + import tensorflow as tf +except ImportError: + warnings.warn("Tensorflow is not installed, dependent functionality is unavailable.") + + +# additional variable to avoid any division by zero when computing the precision and recall metrics +EPSILON = 10 ** -8 +# threshold that is used to binarize predictions of the model +DEFAULT_THRESHOLD = 0.5 + + +def set_random_seed(seed: int) -> None: + """ + Fixes a random seed for reproducibility. + + :param seed: seed value. + """ + numpy.random.seed(seed) + random.seed(seed) + tf.set_random_seed(seed) + + +def binarize(matrix: numpy.array, threshold: float, inplace: bool = True) -> numpy.array: + """ + Helper function to binarize a matrix. + + :param matrix: matrix as a numpy.array. + :param threshold: if value >= threshold then the value will be 1, else 0. + :param inplace: whether to modify the matrix inplace or not. + :return: the binarized matrix. + """ + mask = matrix >= threshold + if inplace: + matrix_ = matrix + else: + matrix_ = matrix.copy() + matrix_[mask] = 1 + matrix_[numpy.logical_not(mask)] = 0 + return matrix_ + + +def str2ints(params: str) -> List[int]: + """ + Convert a string with integer parameters to a list of integers. + + :param params: string that contains integer parameters separated by commas. + :return: list of integers. + """ + return list(map(int, params.split(","))) + + +def precision_np(y_true: numpy.array, y_pred: numpy.array, epsilon: float = EPSILON) -> float: + """ + Computes the precision metric, a metric for multi-label classification of + how many selected items are relevant. + + :param y_true: ground truth labels - expect binary values. + :param y_pred: predicted labels - expect binary values. + :param epsilon: added to the denominator to avoid any division by zero. + :return: precision metric. + """ + true_positives = numpy.sum(y_true * y_pred) + predicted_positives = numpy.sum(y_pred) + return true_positives / (predicted_positives + epsilon) + + +def recall_np(y_true: numpy.array, y_pred: numpy.array, epsilon: float = EPSILON) -> float: + """ + Computes the recall metric, a metric for multi-label classification of + how many relevant items are selected. + + :param y_true: matrix with ground truth labels - expect binary values. 
+ :param y_pred: matrix with predicted labels - expect binary values. + :param epsilon: added to the denominator to avoid any division by zero. + :return: recall metric. + """ + true_positives = numpy.sum(y_true * y_pred) + possible_positives = numpy.sum(y_true) + return true_positives / (possible_positives + epsilon) + + +def report( + model: keras.engine.training.Model, + X: numpy.array, + y: numpy.array, + batch_size: int, + threshold: float = DEFAULT_THRESHOLD, + epsilon: float = EPSILON, +) -> None: + """ + Prints a metric report of the `model` on the data `X` & `y`. + The metrics printed are precision, recall, F1 score. + + :param model: model considered. + :param X: features. + :param y: labels (expected binary labels). + :param batch_size: batch size that will be used for prediction. + :param threshold: threshold to binarize the predictions. + :param epsilon: added to the denominator to avoid any division by zero. + """ + log = logging.getLogger("report") + + # predict & skip the last dimension & binarize + predictions = model.predict(X, batch_size=batch_size, verbose=1)[:, :, 0] + predictions = binarize(predictions, threshold) + + # report + pr = precision_np(y[:, :, 0], predictions, epsilon=epsilon) + rec = recall_np(y[:, :, 0], predictions, epsilon=epsilon) + f1 = 2 * pr * rec / (pr + rec + epsilon) + log.info("precision: %.3f, recall: %.3f, f1: %.3f" % (pr, rec, f1)) + + +def config_keras() -> None: + """ + Initializes keras backend session. + """ + config = tf.ConfigProto() + config.gpu_options.allow_growth = True + kbackend.tensorflow_backend.set_session(tf.Session(config=config)) + + +def build_train_generator( + X: numpy.array, y: numpy.array, batch_size: int = 500 +) -> Iterable[Tuple[numpy.array]]: + """ + Builds the generator that yields features and their labels. + + :param X: features. + :param y: binary labels. + :param batch_size: higher values better utilize GPUs. + :return: generator of features and their labels. + """ + assert X.shape[0] == y.shape[0], "Number of samples mismatch in X and y." + + def xy_generator(): + while True: + n_batches = X.shape[0] // batch_size + if n_batches * batch_size < X.shape[0]: + n_batches += 1 # to yield last samples + for i in range(n_batches): + start = i * batch_size + end = min((i + 1) * batch_size, X.shape[0]) + yield X[start:end], y[start:end] + + return xy_generator() + + +def build_schedule(lr: float, final_lr: float, n_epochs: int) -> Callable: + """ + Builds the schedule of which the learning rate decreases. + The schedule makes the learning rate decrease linearly. + + :param lr: initial learning rate. + :param final_lr: final learning rate. + :param n_epochs: number of training epochs. + :return: the schedule of the learning rate. + """ + delta = (lr - final_lr) / n_epochs + + def schedule(epoch: int) -> float: + assert 0 <= epoch < n_epochs + return lr - delta * epoch + + return schedule + + +def make_lr_scheduler( + lr: float, final_lr: float, n_epochs: int, verbose: int = 1 +) -> keras.callbacks.LearningRateScheduler: + """ + Prepares the scheduler to decrease the learning rate while training. + + :param lr: initial learning rate. + :param final_lr: final learning rate. + :param n_epochs: number of training epochs. + :param verbose: level of verbosity. + :return: LearningRateScheduler with linear schedule of the learning rate. 
+ """ + schedule = build_schedule(lr, final_lr, n_epochs) + return LearningRateScheduler(schedule=schedule, verbose=verbose) + + +def prepare_callbacks(output_dir: str) -> Tuple[Callable]: + """ + Prepares logging, tensorboard, model checkpoint callbacks and stores the outputs in output_dir. + + :param output_dir: path to the results. + :return: list of callbacks. + """ + time = datetime.now().strftime("%y%m%d-%H%M") + log_dir = os.path.join(output_dir, "tensorboard" + time) + logging.info("Tensorboard directory: %s" % log_dir) + tensorboard = TensorBoard( + log_dir=log_dir, batch_size=1000, write_images=True, write_graph=True + ) + csv_path = os.path.join(output_dir, "csv_logger_" + time + ".txt") + logging.info("CSV logs: %s" % csv_path) + csv_logger = CSVLogger(csv_path) + + filepath = os.path.join(output_dir, "best_" + time + ".model") + model_saver = ModelCheckpoint( + filepath, monitor="val_recall", verbose=1, save_best_only=True, mode="max" + ) + return tensorboard, csv_logger, model_saver + + +def create_generator_params( + batch_size: int, samples_per_epoch: int, n_samples: int, epochs: int +) -> Tuple[int]: + """ + Helper function to split a huge dataset into smaller ones to enable more frequent reports. + + :param batch_size: batch size. + :param samples_per_epoch: number of samples per mini-epoch or before each report. + :param n_samples: total number of samples. + :param epochs: number of epochs over the full dataset. + :return: number of steps per epoch (should be used with the generator) and number of sub-epochs + where during sub-epoch only samples_per_epoch will be generated. + """ + steps_per_epoch = samples_per_epoch // batch_size + n_epochs = numpy.ceil(epochs * n_samples / samples_per_epoch) + return steps_per_epoch, n_epochs diff --git a/sourced/ml/core/algorithms/swivel.py b/sourced/ml/core/algorithms/swivel.py new file mode 100644 index 0000000..e8bbd00 --- /dev/null +++ b/sourced/ml/core/algorithms/swivel.py @@ -0,0 +1,493 @@ +#!/usr/bin/env python3 +# +# Copyright 2016 Google Inc. All Rights Reserved. +# Copyright 2017 Sourced Technologies S. L. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Submatrix-wise Vector Embedding Learner. + +Implementation of SwiVel algorithm described at: +http://arxiv.org/abs/1602.02215 + +This program expects an input directory that contains the following files. + + row_vocab.txt, col_vocab.txt + + The row an column vocabulary files. Each file should contain one token per + line; these will be used to generate a tab-separate file containing the + trained embeddings. + + row_sums.txt, col_sum.txt + + The matrix row and column marginal sums. Each file should contain one + decimal floating point number per line which corresponds to the marginal + count of the matrix for that row or column. + + shards.recs + + A file containing the sub-matrix shards, stored as TFRecords. 
Each shard is + expected to be a serialzed tf.Example protocol buffer with the following + properties: + + global_row: the global row indices contained in the shard + global_col: the global column indices contained in the shard + sparse_local_row, sparse_local_col, sparse_value: three parallel arrays + that are a sparse representation of the submatrix counts. + +It will generate embeddings, training from the input directory for +the specified number of epochs. When complete, it will output the trained +vectors to a tab-separated file that contains one line per embedding. Row and +column embeddings are stored in separate files. + +""" + +import glob +import math +import os +import threading +import time + +import numpy +import tensorflow as tf +from tensorflow.python.client import device_lib + +flags = tf.app.flags + +flags.DEFINE_string( + "input_base_path", None, "Directory containing input shards, vocabularies, " "and marginals." +) +flags.DEFINE_string("output_base_path", None, "Path where to write the trained embeddings.") +flags.DEFINE_integer("embedding_size", 300, "Size of the embeddings") +flags.DEFINE_boolean("trainable_bias", False, "Biases are trainable") +flags.DEFINE_integer( + "submatrix_rows", + 4096, + "Rows in each training submatrix. This must match " "the training data.", +) +flags.DEFINE_integer( + "submatrix_cols", + 4096, + "Rows in each training submatrix. This must match " "the training data.", +) +flags.DEFINE_float("loss_multiplier", 1.0 / 4096, "constant multiplier on loss.") +flags.DEFINE_float("confidence_exponent", 0.5, "Exponent for l2 confidence function") +flags.DEFINE_float("confidence_scale", 0.25, "Scale for l2 confidence function") +flags.DEFINE_float("confidence_base", 0.1, "Base for l2 confidence function") +flags.DEFINE_float("learning_rate", 1.0, "Initial learning rate") +flags.DEFINE_string("optimizer", "Adagrad", "SGD optimizer (tf.train.*Optimizer)") +flags.DEFINE_integer("num_concurrent_steps", 2, "Number of threads to train with") +flags.DEFINE_integer("num_readers", 4, "Number of threads to read the input data and feed it") +flags.DEFINE_float("num_epochs", 40, "Number epochs to train for") +flags.DEFINE_float( + "per_process_gpu_memory_fraction", 0, "Fraction of GPU memory to use, 0 means allow_growth" +) +flags.DEFINE_integer("num_gpus", 0, "Number of GPUs to use, 0 means all available") +flags.DEFINE_string("logs", "", "Path for TensorBoard logs (empty value disables them)") + +FLAGS = flags.FLAGS + + +def log(message, *args, **kwargs): + tf.logging.info(message, *args, **kwargs) + + +def get_available_gpus(): + return [d.name for d in device_lib.list_local_devices() if d.device_type == "GPU"] + + +def embeddings_with_init(vocab_size, embedding_dim, name): + """Creates and initializes the embedding tensors.""" + return tf.get_variable( + name=name, + shape=[vocab_size, embedding_dim], + initializer=tf.random_normal_initializer(stddev=math.sqrt(1.0 / embedding_dim)), + ) + + +def count_matrix_input(filenames, submatrix_rows, submatrix_cols): + """Reads submatrix shards from disk.""" + filename_queue = tf.train.string_input_producer(filenames) + reader = tf.WholeFileReader() + _, serialized_example = reader.read(filename_queue) + features = tf.parse_single_example( + serialized_example, + features={ + "global_row": tf.FixedLenFeature([submatrix_rows], dtype=tf.int64), + "global_col": tf.FixedLenFeature([submatrix_cols], dtype=tf.int64), + "sparse_local_row": tf.VarLenFeature(dtype=tf.int64), + "sparse_local_col": 
tf.VarLenFeature(dtype=tf.int64), + "sparse_value": tf.VarLenFeature(dtype=tf.float32), + }, + ) + + global_row = features["global_row"] + global_col = features["global_col"] + + sparse_local_row = features["sparse_local_row"].values + sparse_local_col = features["sparse_local_col"].values + sparse_count = features["sparse_value"].values + + sparse_indices = tf.concat( + axis=1, values=[tf.expand_dims(sparse_local_row, 1), tf.expand_dims(sparse_local_col, 1)] + ) + count = tf.sparse_to_dense( + sparse_indices, [submatrix_rows, submatrix_cols], sparse_count, validate_indices=False + ) + + queued_global_row, queued_global_col, queued_count = tf.train.batch( + [global_row, global_col, count], batch_size=1, num_threads=FLAGS.num_readers, capacity=32 + ) + + queued_global_row = tf.reshape(queued_global_row, [submatrix_rows]) + queued_global_col = tf.reshape(queued_global_col, [submatrix_cols]) + queued_count = tf.reshape(queued_count, [submatrix_rows, submatrix_cols]) + + return queued_global_row, queued_global_col, queued_count + + +def read_marginals_file(filename): + """Reads text file with one number per line to an array.""" + with open(filename) as lines: + return [float(line) for line in lines] + + +def write_embedding_tensor_to_disk(vocab_path, output_path, sess, embedding): + """Writes tensor to output_path as tsv""" + # Fetch the embedding values from the model + embeddings = sess.run(embedding) + + with open(output_path, "w") as out_f: + with open(vocab_path) as vocab_f: + for index, word in enumerate(vocab_f): + word = word.strip() + embedding = embeddings[index] + out_f.write(word + "\t" + "\t".join([str(x) for x in embedding]) + "\n") + + +def write_embeddings_to_disk(config, model, sess): + """Writes row and column embeddings disk""" + # Row Embedding + row_vocab_path = config.input_base_path + "/row_vocab.txt" + row_embedding_output_path = config.output_base_path + "/row_embedding.tsv" + log("Writing row embeddings to: %s", row_embedding_output_path) + write_embedding_tensor_to_disk( + row_vocab_path, row_embedding_output_path, sess, model.row_embedding + ) + + # Column Embedding + col_vocab_path = config.input_base_path + "/col_vocab.txt" + col_embedding_output_path = config.output_base_path + "/col_embedding.tsv" + log("Writing column embeddings to: %s", col_embedding_output_path) + write_embedding_tensor_to_disk( + col_vocab_path, col_embedding_output_path, sess, model.col_embedding + ) + + +class SwivelModel: + """Small class to gather needed pieces from a Graph being built.""" + + def __init__(self, config): + """Construct graph for dmc.""" + self._config = config + + # Create paths to input data files + log("Reading model from: %s", config.input_base_path) + count_matrix_files = glob.glob(os.path.join(config.input_base_path, "shard-*.pb")) + row_sums_path = os.path.join(config.input_base_path, "row_sums.txt") + col_sums_path = os.path.join(config.input_base_path, "col_sums.txt") + + # Read marginals + row_sums = read_marginals_file(row_sums_path) + col_sums = read_marginals_file(col_sums_path) + + self.n_rows = len(row_sums) + self.n_cols = len(col_sums) + log( + "Matrix dim: (%d,%d) SubMatrix dim: (%d,%d)", + self.n_rows, + self.n_cols, + config.submatrix_rows, + config.submatrix_cols, + ) + if self.n_cols < config.submatrix_cols: + raise ValueError( + "submatrix_cols={0} can not be bigger than columns number={1} " + "(specify submatrix_cols={1})".format(config.submatrix_cols, self.n_cols) + ) + if self.n_rows < config.submatrix_rows: + raise ValueError( + 
"submatrix_rows={0} can not be bigger than rows number={1} " + "(specify submatrix_rows={1})".format(config.submatrix_rows, self.n_cols) + ) + self.n_submatrices = ( + self.n_rows * self.n_cols / (config.submatrix_rows * config.submatrix_cols) + ) + log("n_submatrices: %d", self.n_submatrices) + + with tf.device("/cpu:0"): + # ===== CREATE VARIABLES ====== + # Get input + global_row, global_col, count = count_matrix_input( + count_matrix_files, config.submatrix_rows, config.submatrix_cols + ) + + # Embeddings + self.row_embedding = embeddings_with_init( + embedding_dim=config.embedding_size, vocab_size=self.n_rows, name="row_embedding" + ) + self.col_embedding = embeddings_with_init( + embedding_dim=config.embedding_size, vocab_size=self.n_cols, name="col_embedding" + ) + tf.summary.histogram("row_emb", self.row_embedding) + tf.summary.histogram("col_emb", self.col_embedding) + + matrix_log_sum = math.log(numpy.sum(row_sums) + 1) + row_bias_init = [math.log(x + 1) for x in row_sums] + col_bias_init = [math.log(x + 1) for x in col_sums] + self.row_bias = tf.Variable(row_bias_init, trainable=config.trainable_bias) + self.col_bias = tf.Variable(col_bias_init, trainable=config.trainable_bias) + tf.summary.histogram("row_bias", self.row_bias) + tf.summary.histogram("col_bias", self.col_bias) + + # Add optimizer + l2_losses = [] + sigmoid_losses = [] + self.global_step = tf.Variable(0, name="global_step") + learning_rate = tf.Variable(config.learning_rate, name="learning_rate") + opt = getattr(tf.train, FLAGS.optimizer + "Optimizer")(learning_rate) + tf.summary.scalar("learning_rate", learning_rate) + + all_grads = [] + + devices = ( + ["/gpu:%d" % i for i in range(FLAGS.num_gpus)] + if FLAGS.num_gpus > 0 + else get_available_gpus() + ) + self.devices_number = len(devices) + if not self.devices_number: + devices = ["/cpu:0"] + self.devices_number = 1 + for dev in devices: + with tf.device(dev): + with tf.name_scope(dev[1:].replace(":", "_")): + # ===== CREATE GRAPH ===== + # Fetch embeddings. + selected_row_embedding = tf.nn.embedding_lookup(self.row_embedding, global_row) + selected_col_embedding = tf.nn.embedding_lookup(self.col_embedding, global_col) + + # Fetch biases. + selected_row_bias = tf.nn.embedding_lookup([self.row_bias], global_row) + selected_col_bias = tf.nn.embedding_lookup([self.col_bias], global_col) + + # Multiply the row and column embeddings to generate + # predictions. + predictions = tf.matmul( + selected_row_embedding, selected_col_embedding, transpose_b=True + ) + + # These binary masks separate zero from non-zero values. + count_is_nonzero = tf.to_float(tf.cast(count, tf.bool)) + count_is_zero = 1 - count_is_nonzero + + objectives = count_is_nonzero * tf.log(count + 1e-30) + objectives -= tf.reshape(selected_row_bias, [config.submatrix_rows, 1]) + objectives -= selected_col_bias + objectives += matrix_log_sum + + err = predictions - objectives + + # The confidence function scales the L2 loss based on + # the raw co-occurrence count. 
+ l2_confidence = config.confidence_base + config.confidence_scale * tf.pow( + count, config.confidence_exponent + ) + + l2_loss = config.loss_multiplier * tf.reduce_sum( + 0.5 * l2_confidence * err * err * count_is_nonzero + ) + l2_losses.append(tf.expand_dims(l2_loss, 0)) + + sigmoid_loss = config.loss_multiplier * tf.reduce_sum( + tf.nn.softplus(err) * count_is_zero + ) + sigmoid_losses.append(tf.expand_dims(sigmoid_loss, 0)) + + loss = l2_loss + sigmoid_loss + grads = opt.compute_gradients(loss) + all_grads.append(grads) + + with tf.device("/cpu:0"): + # ===== MERGE LOSSES ===== + l2_loss = tf.reduce_mean(tf.concat(axis=0, values=l2_losses), 0, name="l2_loss") + sigmoid_loss = tf.reduce_mean( + tf.concat(axis=0, values=sigmoid_losses), 0, name="sigmoid_loss" + ) + overall_loss = l2_loss + sigmoid_loss + average = tf.train.ExponentialMovingAverage(0.999) + loss_average_op = average.apply((overall_loss, l2_loss, sigmoid_loss)) + self.loss = average.average(overall_loss) + tf.summary.scalar("overall_loss", self.loss) + tf.summary.scalar("l2_loss", average.average(l2_loss)) + tf.summary.scalar("sigmoid_loss", average.average(sigmoid_loss)) + + # Apply the gradients to adjust the shared variables. + apply_gradient_ops = [] + for grads in all_grads: + apply_gradient_ops.append(opt.apply_gradients(grads, global_step=self.global_step)) + + self.train_op = tf.group(loss_average_op, *apply_gradient_ops) + self.saver = tf.train.Saver(sharded=True) + + def initialize_summary(self, sess): + log("creating TensorBoard stuff...") + self.summary = tf.summary.merge_all() + self.writer = tf.summary.FileWriter(FLAGS.logs, sess.graph) + projector_config = tf.contrib.tensorboard.plugins.projector.ProjectorConfig() + embedding_config = projector_config.embeddings.add() + length = min(10000, self.n_rows, self.n_cols) + self.embedding10k = tf.Variable( + tf.zeros((length, self._config.embedding_size)), name="top10k_embedding" + ) + embedding_config.tensor_name = self.embedding10k.name + embedding_config.metadata_path = os.path.join( + self._config.input_base_path, "row_vocab.txt" + ) + tf.contrib.tensorboard.plugins.projector.visualize_embeddings( + self.writer, projector_config + ) + self.saver = tf.train.Saver((self.embedding10k,), max_to_keep=1) + + def write_summary(self, sess): + log("writing the summary...") + length = min(10000, self.n_rows, self.n_cols) + assignment = self.embedding10k.assign( + (self.row_embedding[:length] + self.col_embedding[:length]) / 2 + ) + summary, _, global_step = sess.run((self.summary, assignment, self.global_step)) + self.writer.add_summary(summary, global_step) + self.saver.save(sess, os.path.join(FLAGS.logs, "embeddings10k.checkpoint"), global_step) + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + start_time = time.time() + + omitted = {"handler", "command"} + + log( + "Swivel parameters:\n" + + "\n".join( + "\t{:20} {}".format(key, value) + for key, value in sorted(FLAGS.__dict__.items()) + if key not in omitted + ) + ) + # Create the output path. If this fails, it really ought to fail now. :) + if not os.path.isdir(FLAGS.output_base_path): + os.makedirs(FLAGS.output_base_path) + + # Create and run model + with tf.Graph().as_default(): + log("creating the model...") + model = SwivelModel(FLAGS) + + # Create a session for running Ops on the Graph. 
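        # tf.GPUOptions accepts either a hard cap on the GPU memory reserved at start-up
        # (per_process_gpu_memory_fraction) or allow_growth=True, which lets the allocator
        # expand on demand; exactly one of the two is set below, depending on the flag.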
+ gpu_opts = {} + if FLAGS.per_process_gpu_memory_fraction > 0: + gpu_opts["per_process_gpu_memory_fraction"] = FLAGS.per_process_gpu_memory_fraction + else: + gpu_opts["allow_growth"] = True + gpu_options = tf.GPUOptions(**gpu_opts) + sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) + if FLAGS.logs: + model.initialize_summary(sess) + + # Run the Op to initialize the variables. + log("initializing the variables...") + sess.run(tf.global_variables_initializer()) + + # Start feeding input + log("starting the input threads...") + coord = tf.train.Coordinator() + threads = tf.train.start_queue_runners(sess=sess, coord=coord) + + # Calculate how many steps each thread should run + n_total_steps = int(FLAGS.num_epochs * model.n_rows * model.n_cols) / ( + FLAGS.submatrix_rows * FLAGS.submatrix_cols + ) + n_steps_per_thread = n_total_steps / (FLAGS.num_concurrent_steps * model.devices_number) + n_submatrices_to_train = model.n_submatrices * FLAGS.num_epochs + t0 = [time.time()] + n_steps_between_status_updates = 100 + n_steps_between_summary_updates = 10000 + status_i = [0, 0] + status_lock = threading.Lock() + msg = ( + "%%%dd/%%d submatrices trained (%%.1f%%%%), " "%%5.1f submatrices/sec | loss %%f" + ) % len(str(n_submatrices_to_train)) + + def TrainingFn(): + for _ in range(int(n_steps_per_thread)): + _, global_step, loss = sess.run((model.train_op, model.global_step, model.loss)) + + show_status = False + update_summary = False + with status_lock: + new_i = global_step // n_steps_between_status_updates + if new_i > status_i[0]: + status_i[0] = new_i + show_status = True + new_i = global_step // n_steps_between_summary_updates + if new_i > status_i[1]: + status_i[1] = new_i + update_summary = True + if show_status: + elapsed = float(time.time() - t0[0]) + log( + msg, + global_step, + n_submatrices_to_train, + 100.0 * global_step / n_submatrices_to_train, + n_steps_between_status_updates / elapsed, + loss, + ) + t0[0] = time.time() + if update_summary and FLAGS.logs: + model.write_summary(sess) + + # Start training threads + train_threads = [] + for _ in range(FLAGS.num_concurrent_steps): + t = threading.Thread(target=TrainingFn) + train_threads.append(t) + t.start() + + # Wait for threads to finish. + for t in train_threads: + t.join() + + coord.request_stop() + coord.join(threads) + + # Write out vectors + write_embeddings_to_disk(FLAGS, model, sess) + + # Shutdown + sess.close() + log("Elapsed: %s", time.time() - start_time) + + +if __name__ == "__main__": + tf.app.run() diff --git a/sourced/ml/core/algorithms/tf_idf.py b/sourced/ml/core/algorithms/tf_idf.py new file mode 100644 index 0000000..7cbc21a --- /dev/null +++ b/sourced/ml/core/algorithms/tf_idf.py @@ -0,0 +1,5 @@ +import numpy + + +def log_tf_log_idf(tf, df, ndocs): + return numpy.log(1 + tf) * numpy.log(ndocs / df) diff --git a/sourced/ml/core/algorithms/token_parser.py b/sourced/ml/core/algorithms/token_parser.py new file mode 100644 index 0000000..5df7cb0 --- /dev/null +++ b/sourced/ml/core/algorithms/token_parser.py @@ -0,0 +1,142 @@ +import re + +import Stemmer + + +class TokenParser: + """ + Common utilities for splitting and stemming tokens. + """ + + NAME_BREAKUP_RE = re.compile(r"[^a-zA-Z]+") #: Regexp to split source code identifiers. + STEM_THRESHOLD = 6 #: We do not stem split parts shorter than or equal to this size. + MAX_TOKEN_LENGTH = 256 #: We cut identifiers longer than this value. + MIN_SPLIT_LENGTH = 3 #: We do not split source code identifiers shorter than this value. 
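    #: Note: split() truncates the raw identifier to MAX_TOKEN_LENGTH characters *before*
    #: breaking it up, and stem() leaves sub-tokens of STEM_THRESHOLD characters or fewer
    #: untouched.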
+ DEFAULT_SINGLE_SHOT = False #: True if we do not want to join small identifiers to next one. + # Example: 'sourced.ml.algorithms' -> ["sourc", "sourcedml", "algorithm", "mlalgorithm"]. + # if True we have only ["sourc", "algorithm"]. + # if you do not want to filter small tokens set min_split_length=1. + + def __init__( + self, + stem_threshold=STEM_THRESHOLD, + max_token_length=MAX_TOKEN_LENGTH, + min_split_length=MIN_SPLIT_LENGTH, + single_shot=DEFAULT_SINGLE_SHOT, + ): + self._stemmer = Stemmer.Stemmer("english") + self._stemmer.maxCacheSize = 0 + self._stem_threshold = stem_threshold + self._max_token_length = max_token_length + self._min_split_length = min_split_length + self._single_shot = single_shot + + @property + def stem_threshold(self): + return self._stem_threshold + + @stem_threshold.setter + def stem_threshold(self, value): + if not isinstance(value, int): + raise TypeError("stem_threshold must be an integer - got %s" % type(value)) + if value < 1: + raise ValueError("stem_threshold must be greater than 0 - got %d" % value) + self._stem_threshold = value + + @property + def max_token_length(self): + return self._max_token_length + + @max_token_length.setter + def max_token_length(self, value): + if not isinstance(value, int): + raise TypeError("max_token_length must be an integer - got %s" % type(value)) + if value < 1: + raise ValueError("max_token_length must be greater than 0 - got %d" % value) + self._max_token_length = value + + @property + def min_split_length(self): + return self._min_split_length + + @min_split_length.setter + def min_split_length(self, value): + if not isinstance(value, int): + raise TypeError("min_split_length must be an integer - got %s" % type(value)) + if value < 1: + raise ValueError("min_split_length must be greater than 0 - got %d" % value) + self._min_split_length = value + + def __call__(self, token): + return self.process_token(token) + + def process_token(self, token): + for word in self.split(token): + yield self.stem(word) + + def stem(self, word): + if len(word) <= self.stem_threshold: + return word + return self._stemmer.stemWord(word) + + def split(self, token): + token = token.strip()[: self.max_token_length] + + def ret(name): + r = name.lower() + if len(name) >= self.min_split_length: + ret.last_subtoken = r + yield r + if ret.prev_p and not self._single_shot: + yield ret.prev_p + r + ret.prev_p = "" + elif not self._single_shot: + ret.prev_p = r + yield ret.last_subtoken + r + ret.last_subtoken = "" + + ret.prev_p = "" + ret.last_subtoken = "" + + for part in self.NAME_BREAKUP_RE.split(token): + if not part: + continue + prev = part[0] + pos = 0 + for i in range(1, len(part)): + this = part[i] + if prev.islower() and this.isupper(): + yield from ret(part[pos:i]) + pos = i + elif prev.isupper() and this.islower(): + if 0 < i - 1 - pos <= self.min_split_length: + yield from ret(part[pos:i]) + pos = i + elif i - 1 > pos: + yield from ret(part[pos:i]) + pos = i + prev = this + last = part[pos:] + if last: + yield from ret(last) + + def __getstate__(self): + state = self.__dict__.copy() + del state["_stemmer"] + return state + + def __setstate__(self, state): + self.__dict__ = state + self._stemmer = Stemmer.Stemmer("english") + + +class NoopTokenParser: + """ + One can use this class if he or she does not want to do any parsing. 
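    It simply yields every token unchanged, exactly once.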
+ """ + + def process_token(self, token): + yield token + + def __call__(self, token): + return self.process_token(token) diff --git a/sourced/ml/core/algorithms/uast/__init__.py b/sourced/ml/core/algorithms/uast/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sourced/ml/core/algorithms/uast/id_distance.py b/sourced/ml/core/algorithms/uast/id_distance.py new file mode 100644 index 0000000..197d12f --- /dev/null +++ b/sourced/ml/core/algorithms/uast/id_distance.py @@ -0,0 +1,124 @@ +from itertools import combinations +from typing import Iterable, Tuple, Union + +import bblfsh + +from sourced.ml.core.algorithms.uast.ids_to_bag import UastIds2Bag +from sourced.ml.core.utils import bblfsh_roles + + +class Uast2IdDistance(UastIds2Bag): + """ + Converts a UAST to a list of identifiers pair and UAST distance between. + Distance metric must be defined in the inheritors. + + __call__ is overridden here and return list instead of bag-of-words (dist). + """ + + DEFAULT_MAX_DISTANCE = 10 # to avoid collecting all distances we skip too big ones + + def __init__(self, token2index=None, token_parser=None, max_distance=DEFAULT_MAX_DISTANCE): + """ + :param token2index: The mapping from tokens to token key. If None, no mapping is performed. + :param token_parser: Specify token parser if you want to use a custom one. \ + :class:'TokenParser' is used if it is not specified. + :param max_distance: specify to skip too distant identifiers + """ + super().__init__(token2index=token2index, token_parser=token_parser) + self.max_distance = max_distance + + def __call__(self, uast: bblfsh.Node) -> Iterable[Tuple[str, str, int]]: + """ + Converts a UAST to a list of identifiers pair and UAST distance between. + The tokens are preprocessed by _token_parser. + + :param uast: The UAST root node. + :return: a list of (from identifier, to identifier) and distance pairs. + """ + for point1, point2 in combinations(self._process_uast(uast), 2): + if point1[0] == point2[0]: + continue # We do not want to calculate distance between the same identifiers + distance = self.distance(point1, point2) + if distance < self.max_distance: + yield ( + (point1[0], point2[0]) if point1[0] > point2[0] else (point2[0], point1[0]) + ), distance + + def distance(self, point1, point2) -> Union[int, float]: + """ + Calculate distance between two points. A point can be anything. self._process_uast returns + list of points in the specific class. + + :return: Distance between two points. + """ + raise NotImplementedError + + def _process_uast(self, node: bblfsh.Node) -> Iterable: + """ + Converts uast to points list. A point can be anything you need to calculate distance. + """ + raise NotImplementedError + + def _process_point(self, node, info): + if bblfsh_roles.IDENTIFIER in node.roles and node.token: + for sub in self._token_parser.process_token(node.token): + try: + yield (self._token2index[sub], info) + except KeyError: + continue + + +class Uast2IdTreeDistance(Uast2IdDistance): + """ + Converts a UAST to a list of identifiers pair and UAST tree distance between. + + __call__ is overridden here and return list instead of bag-of-words (dist). 
+ """ + + def _process_uast(self, uast: bblfsh.Node) -> Iterable: + stack = [(uast, [])] + while stack: + node, ancestors = stack.pop() + yield from self._process_point(node, ancestors) + ancestors = list(ancestors) + ancestors.append(node) + stack.extend([(child, ancestors) for child in node.children]) + + def distance(self, point1, point2) -> int: + i = 0 + ancestors1 = point1[1] + ancestors2 = point2[1] + for i, (ancestor1, ancestor2) in enumerate(zip(ancestors1, ancestors2)): # noqa: B007 + if ancestor1 != ancestor2: + break + distance = self.calc_tree_distance(i, len(ancestors1), len(ancestors2)) + return distance + + @staticmethod + def calc_tree_distance(last_common_level, level1, level2): + return level1 + level2 - 2 * last_common_level + + +class Uast2IdLineDistance(Uast2IdDistance): + """ + Converts a UAST to a list of identifiers pair and code line distance between where applicable. + + __call__ is overridden here and return list instead of bag-of-words (dist). + """ + + def _process_uast(self, uast): + stack = [(uast, [0, 0])] + while stack: + node, last_position = stack.pop() + if node.start_position.line != 0: + # A lot of Nodes do not have position + # It is good heuristic to take the last Node in tree with a position. + last_position[0] = node.start_position.line + last_position[1] = 0 + if node.start_position.col != 0: + last_position[1] = node.start_position.col + yield from self._process_point(node, last_position) + stack.extend([(child, list(last_position)) for child in node.children]) + + def distance(self, point1, point2): + return abs(point1[1][0] - point2[1][0]) # subtract line numbers diff --git a/sourced/ml/core/algorithms/uast/ids_to_bag.py b/sourced/ml/core/algorithms/uast/ids_to_bag.py new file mode 100644 index 0000000..7a171d5 --- /dev/null +++ b/sourced/ml/core/algorithms/uast/ids_to_bag.py @@ -0,0 +1,110 @@ +from collections import defaultdict, deque + +import bblfsh + +from sourced.ml.core.algorithms.token_parser import NoopTokenParser, TokenParser +from sourced.ml.core.algorithms.uast.to_bag import Uast2BagBase +from sourced.ml.core.utils import bblfsh_roles + + +def uast2sequence(root): + sequence = [] + nodes = defaultdict(deque) + stack = [root] + nodes[id(root)].extend(root.children) + while stack: + if nodes[id(stack[-1])]: + child = nodes[id(stack[-1])].popleft() + nodes[id(child)].extend(child.children) + stack.append(child) + else: + sequence.append(stack.pop()) + return sequence + + +class FakeVocabulary: + # FIXME(zurk): change to simple function. Vadim Markovtsev comments: + # > would rather made this a simple function and change roles2index + # type from [] to callable. Saves time to understand. + def __getitem__(self, item): + return item + + +class UastTokens2Bag(Uast2BagBase): + """ + Converts a UAST to a weighed bag of tokens via xpath. + """ + + XPATH = None # Should be overridden in child class + + def __init__(self, token2index=None, token_parser=None): + """ + :param token2index: The mapping from tokens to bag keys. If None, no mapping is performed. + :param token_parser: Specify token parser if you want to use a custom one. \ + :class:'NoopTokenParser' is used if it is not specified. 
+ """ + self._token2index = FakeVocabulary() if token2index is None else token2index + self._token_parser = NoopTokenParser() if token_parser is None else token_parser + + @property + def token_parser(self): + return self._token_parser + + @property + def token2index(self): + return self._token2index + + def __call__(self, uast): + """ + Converts a UAST to a weighed bag-of-words. The weights are words frequencies. + The tokens are preprocessed by _token_parser. + + :param uast: The UAST root node. + :return: + """ + nodes = bblfsh.filter(uast, self.XPATH) + bag = defaultdict(int) + for node in nodes: + for sub in self._token_parser.process_token(node.token): + try: + bag[self._token2index[sub]] += 1 + except KeyError: + continue + return bag + + +class UastIds2Bag(UastTokens2Bag): + """ + Converts a UAST to a bag-of-identifiers. + """ + + XPATH = "//*[@roleIdentifier]" + + def __init__(self, token2index=None, token_parser=None): + """ + :param token2index: The mapping from tokens to bag keys. If None, no mapping is performed. + :param token_parser: Specify token parser if you want to use a custom one. \ + :class:'TokenParser' is used if it is not specified. + """ + token_parser = TokenParser() if token_parser is None else token_parser + super().__init__(token2index, token_parser) + + def __call__(self, uast): + """ + HOTFIX for https://github.com/bblfsh/client-python/issues/92 + Converts a UAST to a weighed bag-of-identifiers. The weights are identifiers frequencies. + The tokens are preprocessed by _token_parser. + Overwrite __call__ to avoid issues with `bblfsh.filter`. + + :param uast: The UAST root node. + :return: bag + """ + nodes = [node for node in uast2sequence(uast) if bblfsh_roles.IDENTIFIER in node.roles] + bag = defaultdict(int) + for node in nodes: + for sub in self._token_parser.process_token(node.token): + try: + bag[self._token2index[sub]] += 1 + except KeyError: + continue + return bag diff --git a/sourced/ml/core/algorithms/uast/inttypes_to_graphlets.py b/sourced/ml/core/algorithms/uast/inttypes_to_graphlets.py new file mode 100644 index 0000000..816d1b9 --- /dev/null +++ b/sourced/ml/core/algorithms/uast/inttypes_to_graphlets.py @@ -0,0 +1,60 @@ +from collections import defaultdict + +from sourced.ml.core.algorithms.uast.ids_to_bag import Uast2BagBase +from sourced.ml.core.algorithms.uast.struct_to_bag import Node + + +class Uast2GraphletBag(Uast2BagBase): + """ + Converts a UAST to a bag of graphlets. + The graphlet of a UAST node is composed from the node itself, its parent and its children. + Each node is represented by the internal role string. + """ + + @staticmethod + def _extract_node(node, parent): + return Node(parent=parent, internal_type=node.internal_type) + + def uast2graphlets(self, uast): + """ + :param uast: The UAST root node. + :generate: The nodes which compose the UAST. + :class: 'Node' is used to access the nodes of the graphlets. + """ + root = self._extract_node(uast, None) + stack = [(root, uast)] + while stack: + parent, parent_uast = stack.pop() + children_nodes = [self._extract_node(child, parent) for child in parent_uast.children] + parent.children = children_nodes + stack.extend(zip(children_nodes, parent_uast.children)) + yield parent + + def node2key(self, node): + """ + Builds the string joining internal types of all the nodes + in the node's graphlet in the following order: + parent_node_child1_child2_child3. The children are sorted by alphabetic order. + str format is required for BagsExtractor. 
+ + :param node: a node of UAST + :return: The string key of node + """ + try: + parent_type = node.parent.internal_type + except AttributeError: + parent_type = None + key = [parent_type, node.internal_type] + key.extend(sorted(ch.internal_type for ch in node.children)) + return "_".join(map(str, key)) + + def __call__(self, uast): + """ + Converts a UAST to a weighed bag of graphlets. The weights are graphlets frequencies. + :param uast: The UAST root node. + :return: bag of graphlets. + """ + bag = defaultdict(int) + for node in self.uast2graphlets(uast): + bag[self.node2key(node)] += 1 + return bag diff --git a/sourced/ml/core/algorithms/uast/inttypes_to_nodes.py b/sourced/ml/core/algorithms/uast/inttypes_to_nodes.py new file mode 100644 index 0000000..c9ed05e --- /dev/null +++ b/sourced/ml/core/algorithms/uast/inttypes_to_nodes.py @@ -0,0 +1,65 @@ +from typing import Iterable, Tuple, Union + +from bblfsh import Node +import numpy + +from sourced.ml.core.algorithms.uast.to_bag import Uast2BagThroughSingleScan + + +class Uast2QuantizedChildren(Uast2BagThroughSingleScan): + """ + Converts a UAST to a bag of children counts. + """ + + def __init__(self, npartitions: int = 20): + self.npartitions = npartitions + self.levels = {} + + def node2key(self, node: Node) -> Union[str, Tuple[str, int]]: + """ + :param node: a node in UAST. + :return: The string which consists of the internal type of the node and its number of + children. + """ + if not self.levels: + return node.internal_type, len(node.children) + qm = self.levels[node.internal_type] + quant_index = numpy.searchsorted(qm, len(node.children), side="right") - 1 + return "%s_%d" % (node.internal_type, quant_index) + + def quantize(self, frequencies: Iterable[Tuple[str, Iterable[Tuple[int, int]]]]): + for key, vals in frequencies: + self.levels[key] = self.quantize_unwrapped(vals) + + # Somewhere before calling this keys are converted to int so it works with spark + def quantize_unwrapped(self, children_freq: Iterable[Tuple[int, int]]) -> numpy.ndarray: + """ + Builds the quantization partition P that is a vector of length nb_partitions \ + whose entries are in strictly ascending order. + Quantization of x is defined as: + 0 if x <= P[0] + m if P[m-1] < x <= P[m] + n if P[n] <= x + + :param children_freq: distribution of the number of children. + :return: The array with quantization levels. 
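        Illustrative example: with npartitions=2 and
        children_freq=[(1, 5), (2, 5), (3, 5), (5, 5)], roughly half of the observed nodes
        fall into each bin and the returned levels are [1, 3, 5].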
+ """ + levels = numpy.zeros(self.npartitions + 1, dtype=numpy.int32) + children_freq = sorted(children_freq) + max_nodes_per_bin = sum(i[1] for i in children_freq) / self.npartitions + levels[0] = children_freq[0][0] + accum = children_freq[0][1] + i = 1 + for v, f in children_freq[1:]: + accum += f + if accum > max_nodes_per_bin: + accum = f + if i < len(levels): + levels[i] = v + i += 1 + last = children_freq[-1][0] + if i < len(levels): + levels[i:] = last + else: + levels[-1] = last + return levels diff --git a/sourced/ml/core/algorithms/uast/struct_to_bag.py b/sourced/ml/core/algorithms/uast/struct_to_bag.py new file mode 100644 index 0000000..1ca7b23 --- /dev/null +++ b/sourced/ml/core/algorithms/uast/struct_to_bag.py @@ -0,0 +1,199 @@ +from collections import defaultdict +import random + +from sourced.ml.core.algorithms.uast.ids_to_bag import FakeVocabulary, Uast2BagBase, uast2sequence + + +class Uast2StructBagBase(Uast2BagBase): + SEP = ">" + + def __init__(self, stride, seq_len, node2index=None): + self._node2index = node2index if node2index is not None else FakeVocabulary() + self._stride = stride + if not isinstance(seq_len, (int, tuple, list)): + raise TypeError("Unexpected type of seq_len: %s" % type(seq_len)) + self._seq_lens = [seq_len] if isinstance(seq_len, int) else seq_len + + @property + def node2index(self): + return self._node2index + + +class Node2InternalType: + # FIXME(zurk): change to simple function. Vadim Markovtsev comments: + # > would rather made this a simple function and change roles2index + # type from [] to callable. Saves time to understand. + def __getitem__(self, item): + return item.internal_type + + +class UastSeq2Bag(Uast2StructBagBase): + """ + DFS traversal + preserves the order of node children. + """ + + def __init__(self, stride=1, seq_len=(3, 4), node2index=None): + _node2index = Node2InternalType() if node2index is None else node2index + super().__init__(stride, seq_len, _node2index) + + def __call__(self, uast): + bag = defaultdict(int) + node_sequence = uast2sequence(uast) + + # convert to str - requirement from wmhash.BagsExtractor + node_sequence = [self.node2index[n] for n in node_sequence] + + for seq_len in self._seq_lens: + for i in range(0, len(node_sequence) - seq_len + 1, self._stride): + key = self.SEP.join(node_sequence[i : i + seq_len]) + bag[key] += 1 + return bag + + +class Node: + def __init__(self, parent=None, internal_type=None): + self.parent = parent + self.internal_type = internal_type + self.children = [] + + @property + def neighbours(self): + neighbours = [] + if self.parent is not None: + neighbours.append(self.parent) + neighbours.extend(self.children) + return neighbours + + +class Uast2RandomWalks: + """ + Generation of random walks for UAST. + """ + + def __init__( + self, + p_explore_neighborhood, + q_leave_neighborhood, + n_walks, + n_steps, + node2index=None, + seed=None, + ): + """ + Related article: https://arxiv.org/abs/1607.00653 + + :param p_explore_neighborhood: return parameter, p. Parameter p controls the likelihood of\ + immediately revisiting a node in the walk. Setting it to a\ + high value (> max(q, 1)) ensures that we are less likely to\ + sample an already visited node in the following two steps\ + (unless the next node in the walk had no other neighbor).\ + This strategy encourages moderate exploration and avoids\ + 2-hop redundancy in sampling. + :param q_leave_neighborhood: in-out parameter, q. Parameter q allows the search to\ + differentiate between “inward” and “outward” nodes. 
Such \ + walks obtain a local view of the underlying graph with \ + respect to the start node in the walk and approximate BFS \ + behavior in the sense that our samples comprise of nodes \ + within a small locality. + :param n_walks: Number of walks from each node. + :param n_steps: Number of steps in walk. + :param node2index: Specify node2index transformation. Node2InternalType() is used as \ + default. + :param seed: Random seed. + """ + self.p_explore_neighborhood = p_explore_neighborhood + self.q_leave_neighborhood = q_leave_neighborhood + self.n_walks = n_walks + self.n_steps = n_steps + self.node2index = node2index if node2index is not None else Node2InternalType() + if seed is not None: + random.seed(seed) + + def __call__(self, uast): + starting_nodes = self.prepare_starting_nodes(uast) + for _ in range(self.n_walks): + for start_node in starting_nodes: + yield self.random_walk(start_node) + + @staticmethod + def _extract_node(node, parent): + return Node(parent=parent, internal_type=node.internal_type) + + def prepare_starting_nodes(self, uast): + starting_nodes = [] + root = self._extract_node(uast, None) + stack = [(root, uast)] + while stack: + parent, parent_uast = stack.pop() + children_nodes = [self._extract_node(child, parent) for child in parent_uast.children] + parent.children = children_nodes + stack.extend(zip(children_nodes, parent_uast.children)) + starting_nodes.append(parent) + + return starting_nodes + + def random_walk(self, node): + walk = [node] + while len(walk) < self.n_steps: + walk.append(self.alias_sample(walk)) + + walk = [self.node2index[n] for n in walk] + return walk + + def alias_sample(self, walk): + """ + Compare to node2vec this sampling is a bit simpler because there is no loop in tree -> + so there are only 2 options with unnormalized probabilities 1/p & 1/q + Related article: https://arxiv.org/abs/1607.00653 + + :param walk: list of visited nodes + :return: next node to visit + """ + last_node = walk[-1] # correspond to node v in article + + if len(walk) == 1: + choice_list = last_node.children + if last_node.parent is not None: + choice_list.append(last_node.parent) + if len(choice_list) == 0: + return last_node + return random.choice(last_node.children) + + threshold = 1 / self.p_explore_neighborhood + threshold /= threshold + len(last_node.children) / self.q_leave_neighborhood + + if random.random() <= threshold: + # With threshold probability we need to return back to previous node. + return walk[-2] # Node from previous step. Correspond to node t in article. 
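        # Otherwise move on to a uniformly chosen neighbour (the parent or any child).
        # Because a UAST is a tree there are no cycles, so the general node2vec
        # transition kernel reduces to this single backtrack-vs-advance choice with
        # unnormalized weights 1/p and 1/q.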
+ + return random.choice(last_node.neighbours) + + +class UastRandomWalk2Bag(Uast2StructBagBase): + def __init__( + self, + p_explore_neighborhood=0.79, + q_leave_neighborhood=0.82, + n_walks=2, + n_steps=10, + stride=1, + seq_len=(2, 3), + seed=42, + ): + super().__init__(stride, seq_len) + self.uast2walks = Uast2RandomWalks( + p_explore_neighborhood=p_explore_neighborhood, + q_leave_neighborhood=q_leave_neighborhood, + n_walks=n_walks, + n_steps=n_steps, + seed=seed, + ) + + def __call__(self, uast): + bag = defaultdict(int) + for walk in self.uast2walks(uast): + for seq_len in self._seq_lens: + for i in range(0, len(walk) - seq_len + 1, self._stride): + # convert to str - requirement from wmhash.BagsExtractor + bag[self.SEP.join(walk[i : i + seq_len])] += 1 + return bag diff --git a/sourced/ml/core/algorithms/uast/to_bag.py b/sourced/ml/core/algorithms/uast/to_bag.py new file mode 100644 index 0000000..52650ec --- /dev/null +++ b/sourced/ml/core/algorithms/uast/to_bag.py @@ -0,0 +1,36 @@ +from collections import defaultdict +from typing import Dict + +from bblfsh import Node + + +class Uast2BagBase: + """ + Base class to convert UAST to a bag of anything. + """ + + def __call__(self, uast: Node): + """ + Inheritors must implement this function. + + :param uast: The UAST root node. + """ + raise NotImplementedError + + +class Uast2BagThroughSingleScan(Uast2BagBase): + """ + Constructs the bag by doing a single tree traversal and turning every node into a string. + """ + + def __call__(self, uast: Node) -> Dict[str, int]: + result = defaultdict(int) + stack = [uast] + while stack: + node = stack.pop() + stack.extend(node.children) + result[self.node2key(node)] += 1 + return result + + def node2key(self, node) -> str: + raise NotImplementedError diff --git a/sourced/ml/core/algorithms/uast/to_id_sequence.py b/sourced/ml/core/algorithms/uast/to_id_sequence.py new file mode 100644 index 0000000..e984674 --- /dev/null +++ b/sourced/ml/core/algorithms/uast/to_id_sequence.py @@ -0,0 +1,30 @@ +from typing import Iterable + +import bblfsh + +from sourced.ml.core.algorithms.uast.id_distance import Uast2IdLineDistance + + +class Uast2IdSequence(Uast2IdLineDistance): + """ + Converts a UAST to a sorted sequence of identifiers. + Identifiers are sorted by position in code. + We do not change the order if positions are not present. + + __call__ is overridden here and return list instead of bag-of-words (dist). + """ + + def __call__(self, uast: bblfsh.Node) -> str: + """ + Converts a UAST to a sorted sequence of identifiers. + Identifiers are sorted by position in code. + We do not change the order if positions are not present. + + :param uast: The UAST root node. + :return: string with a sequence of identifiers + """ + return self.concat(id for id, pos in sorted(self._process_uast(uast), key=lambda x: x[1])) + + @staticmethod + def concat(id_sequence: Iterable): + return " ".join(id_sequence) diff --git a/sourced/ml/core/algorithms/uast/to_role_id_pairs.py b/sourced/ml/core/algorithms/uast/to_role_id_pairs.py new file mode 100644 index 0000000..4c7f514 --- /dev/null +++ b/sourced/ml/core/algorithms/uast/to_role_id_pairs.py @@ -0,0 +1,69 @@ +from typing import Iterable, Tuple + +import bblfsh + +from sourced.ml.core.algorithms.uast.ids_to_bag import UastIds2Bag +from sourced.ml.core.utils import bblfsh_roles + + +class Uast2RoleIdPairs(UastIds2Bag): + """ + Converts a UAST to a list of pairs. Pair is identifier and role, where role is Node role + where identifier was found. 
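    For example, a variable declaration might yield a pair such as
    ("counter", "DECLARATION | VARIABLE"); the exact role sets depend on the Babelfish
    driver.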
+ + __call__ is overridden here and returns list instead of bag-of-words (dist). + """ + + def __init__(self, token2index=None, token_parser=None): + """ + :param token2index: The mapping from tokens to token key. If None, no mapping is performed. + :param token_parser: Specify token parser if you want to use a custom one. \ + :class:'TokenParser' is used if it is not specified. + + """ + super().__init__(token2index=token2index, token_parser=token_parser) + self.exclude_roles = { + bblfsh_roles.EXPRESSION, + bblfsh_roles.IDENTIFIER, + bblfsh_roles.LEFT, + bblfsh_roles.QUALIFIED, + bblfsh_roles.BINARY, + bblfsh_roles.ASSIGNMENT, + } + + def __call__(self, uast: bblfsh.Node) -> Iterable[Tuple[str, str]]: + """ + Converts a UAST to a list of identifier, role pairs. + The tokens are preprocessed by _token_parser. + + :param uast: The UAST root node. + :return: a list of identifier, role pairs. + """ + yield from self._process_uast(uast, []) + + def _process_uast(self, uast: bblfsh.Node, ancestors): + stack = [(uast, [])] + while stack: + node, ancestors = stack.pop() + + if bblfsh_roles.IDENTIFIER in node.roles and node.token: + roles = set(node.roles) + indx = -1 + # We skip all Nodes with roles from `self.exclude_roles` set. + # We skip any Node with OPERATOR role. + # For them we take first parent Node from stack with another Role set. + while not (roles - self.exclude_roles and bblfsh_roles.OPERATOR not in roles): + roles = set(ancestors[indx].roles) + indx -= 1 + for sub in self._token_parser.process_token(node.token): + try: + yield (self._token2index[sub], self.merge_roles(roles)) + except KeyError: + continue + ancestors = list(ancestors) + ancestors.append(node) + stack.extend([(child, ancestors) for child in node.children]) + + @staticmethod + def merge_roles(roles: Iterable[int]): + return " | ".join(bblfsh.role_name(r) for r in sorted(roles)) diff --git a/sourced/ml/core/extractors/__init__.py b/sourced/ml/core/extractors/__init__.py new file mode 100644 index 0000000..38a2157 --- /dev/null +++ b/sourced/ml/core/extractors/__init__.py @@ -0,0 +1,2 @@ +# flake8: noqa +from sourced.ml.core.extractors.identifier_distance import IdentifierDistance diff --git a/sourced/ml/core/extractors/bags_extractor.py b/sourced/ml/core/extractors/bags_extractor.py new file mode 100644 index 0000000..9906695 --- /dev/null +++ b/sourced/ml/core/extractors/bags_extractor.py @@ -0,0 +1,96 @@ +import bblfsh + +from sourced.ml.core.algorithms import NoopTokenParser, Uast2RoleIdPairs +from sourced.ml.core.utils import PickleableLogger + + +class Extractor(PickleableLogger): + """ + Converts a single UAST via `algorithm` to anything you need. + It is a wrapper to use in `Uast2Features` Transformer in a pipeline. + """ + + NAME = None # feature scheme name, should be overridden in the derived class. + ALGORITHM = None # algorithm class to extract from UAST + OPTS = {} # cmdline args which are passed into __init__() + + def _get_log_name(self): + return type(self).__name__ + + @classmethod + def get_kwargs_fromcmdline(cls, args): + prefix = cls.NAME + "_" + result = {} + for k, v in args.__dict__.items(): + if k.startswith(prefix): + result[k[len(prefix) :]] = v + return result + + def extract(self, uast: bblfsh.Node): + yield from self.ALGORITHM(uast) + + +class BagsExtractor(Extractor): + """ + Converts a single UAST into the weighted set (dictionary), where elements are strings + and the values are floats. The derived classes must implement uast_to_bag(). 
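    extract() prefixes every key returned by uast_to_bag() with NAMESPACE and multiplies
    its value by the configured weight.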
+ """ + + DEFAULT_DOCFREQ_THRESHOLD = 5 + NAMESPACE = None # the beginning of each element in the bag + OPTS = {"weight": 1} # cmdline args which are passed into __init__() + + def __init__(self, docfreq_threshold=None, weight=None, **kwargs): + """ + :param docfreq_threshold: The minimum number of occurrences of an element to be included \ + into the bag + :param weight: TF-IDF will be multiplied by this weight to change importance of specific \ + bag extractor + :param kwargs: Parameters for parent constructor. + """ + super().__init__(**kwargs) + if docfreq_threshold is None: + docfreq_threshold = self.DEFAULT_DOCFREQ_THRESHOLD + self.docfreq_threshold = docfreq_threshold + self.docfreq = {} + self._ndocs = 0 + if weight is None: + self.weight = 1 + else: + self.weight = weight + + @property + def docfreq_threhold(self): + return self._docfreq_threshold + + @docfreq_threhold.setter + def docfreq_threshold(self, value): + if not isinstance(value, int): + raise TypeError("docfreq_threshold must be an integer, got %s" % type(value)) + if value < 1: + raise ValueError("docfreq_threshold must be >= 1, got %d" % value) + self._docfreq_threshold = value + + @property + def ndocs(self): + return self._ndocs + + @ndocs.setter + def ndocs(self, value): + if not isinstance(value, int): + raise TypeError("ndocs must be an integer, got %s" % type(value)) + if value < 1: + raise ValueError("ndocs must be >= 1, got %d" % value) + self._ndocs = value + + def extract(self, uast): + for key, val in self.uast_to_bag(uast).items(): + yield self.NAMESPACE + key, val * self.weight + + def uast_to_bag(self, uast): + raise NotImplementedError + + +class RoleIdsExtractor(Extractor): + NAME = "roleids" + ALGORITHM = Uast2RoleIdPairs(token_parser=NoopTokenParser()) diff --git a/sourced/ml/core/extractors/children.py b/sourced/ml/core/extractors/children.py new file mode 100644 index 0000000..0963189 --- /dev/null +++ b/sourced/ml/core/extractors/children.py @@ -0,0 +1,53 @@ +import logging +from typing import Iterable, Tuple + +from sourced.ml.core.algorithms import Uast2QuantizedChildren +from sourced.ml.core.extractors import ( + BagsExtractor, + filter_kwargs, + get_names_from_kwargs, + register_extractor, +) + + +@register_extractor +class ChildrenBagExtractor(BagsExtractor): + """ + Converts a UAST to the bag of pairs (internal type, quantized number of children). + """ + + NAME = "children" + NAMESPACE = "c." 
+ OPTS = dict(get_names_from_kwargs(Uast2QuantizedChildren.__init__)) + + def __init__(self, docfreq_threshold=None, **kwargs): + original_kwargs = kwargs + uast2bag_kwargs = filter_kwargs(kwargs, Uast2QuantizedChildren.__init__) + for k in uast2bag_kwargs: + kwargs.pop(k) + super().__init__(docfreq_threshold, **kwargs) + self._log.debug("__init__ %s", original_kwargs) + self.uast_to_bag = Uast2QuantizedChildren(**uast2bag_kwargs) + + @property + def npartitions(self): + return self.uast_to_bag.npartitions + + @property + def levels(self): + return self.uast_to_bag.levels + + def extract(self, uast): + if not self.uast_to_bag.levels: + # bypass NAMESPACE + gen = self.uast_to_bag(uast).items() + else: + gen = super().extract(uast) + for key, val in gen: + yield key, val + + def quantize(self, frequencies: Iterable[Tuple[str, Iterable[Tuple[int, int]]]]): + self.uast_to_bag.quantize(frequencies) + if self._log.isEnabledFor(logging.DEBUG): + for k, v in self.uast_to_bag.levels.items(): + self._log.debug("%s\n%s", k, v) diff --git a/sourced/ml/core/extractors/graphlets.py b/sourced/ml/core/extractors/graphlets.py new file mode 100644 index 0000000..ef15b12 --- /dev/null +++ b/sourced/ml/core/extractors/graphlets.py @@ -0,0 +1,28 @@ +from sourced.ml.core.algorithms.uast.inttypes_to_graphlets import Uast2GraphletBag +from sourced.ml.core.extractors import ( + BagsExtractor, + filter_kwargs, + get_names_from_kwargs, + register_extractor, +) + + +@register_extractor +class GraphletBagExtractor(BagsExtractor): + NAME = "graphlet" + NAMESPACE = "g." + OPTS = dict(get_names_from_kwargs(Uast2GraphletBag.__init__)) + OPTS.update(BagsExtractor.OPTS) + + def __init__(self, docfreq_threshold=None, **kwargs): + original_kwargs = kwargs + uast2bag_kwargs = filter_kwargs(kwargs, Uast2GraphletBag.__init__) + for k in uast2bag_kwargs: + kwargs.pop(k) + super().__init__(docfreq_threshold, **kwargs) + self._log.debug("__init__ %s", original_kwargs) + uast2bag_kwargs = filter_kwargs(kwargs, Uast2GraphletBag.__init__) + self.uast2bag = Uast2GraphletBag(**uast2bag_kwargs) + + def uast_to_bag(self, uast): + return self.uast2bag(uast) diff --git a/sourced/ml/core/extractors/helpers.py b/sourced/ml/core/extractors/helpers.py new file mode 100644 index 0000000..7e6d0b6 --- /dev/null +++ b/sourced/ml/core/extractors/helpers.py @@ -0,0 +1,38 @@ +import argparse +import inspect +from typing import List + +from sourced.ml.core.extractors.bags_extractor import BagsExtractor + +__extractors__ = {} + + +def register_extractor(cls): + if not issubclass(cls, BagsExtractor): + raise TypeError("%s is not an instance of %s" % (cls.__name__, BagsExtractor.__name__)) + __extractors__[cls.NAME] = cls + return cls + + +def get_names_from_kwargs(func): + for k, v in inspect.signature(func).parameters.items(): + if v.default != inspect.Parameter.empty and isinstance( + v.default, (str, int, float, tuple) + ): + yield k.replace("_", "-"), v.default + + +def filter_kwargs(kwargs, func): + func_param = inspect.signature(func).parameters.keys() + return {k: v for k, v in kwargs.items() if k in func_param} + + +def create_extractors_from_args(args: argparse.Namespace) -> List[BagsExtractor]: + return [ + __extractors__[s]( + args.min_docfreq, + log_level=args.log_level, + **__extractors__[s].get_kwargs_fromcmdline(args) + ) + for s in args.feature + ] diff --git a/sourced/ml/core/extractors/id_sequence.py b/sourced/ml/core/extractors/id_sequence.py new file mode 100644 index 0000000..1bafe5e --- /dev/null +++ 
b/sourced/ml/core/extractors/id_sequence.py @@ -0,0 +1,34 @@ +from typing import Iterable + +import bblfsh + +from sourced.ml.core.algorithms.token_parser import NoopTokenParser +from sourced.ml.core.algorithms.uast.to_id_sequence import Uast2IdSequence +from sourced.ml.core.extractors.bags_extractor import BagsExtractor + + +class IdSequenceExtractor(BagsExtractor): + """ + Extractor wrapper for Uast2RoleIdPairs algorithm. + Note that this is unusual BagsExtractor since it returns iterable instead of bag. + + The class did not wrap with @register_extractor because it does not produce bags as others do. + So nobody outside code will see it or use it directly. + For the same reason we a free to override NAMESPACE, NAME, OPTS fields with any value we want. + + TODO(zurk): Split BagsExtractor into two clases: Extractor and BagsExtractor(Extractor), + re-inherit this class from Extractor, delete explanations from docstring. + """ + + NAMESPACE = "" + NAME = "id sequence" + OPTS = {} + + def __init__(self, split_stem=False, **kwargs): + super().__init__(**kwargs) + self.uast2id_sequence = Uast2IdSequence( + None, NoopTokenParser() if not split_stem else None + ) + + def extract(self, uast: bblfsh.Node) -> Iterable[str]: + yield self.uast2id_sequence(uast), None diff --git a/sourced/ml/core/extractors/identifier_distance.py b/sourced/ml/core/extractors/identifier_distance.py new file mode 100644 index 0000000..58b9c9d --- /dev/null +++ b/sourced/ml/core/extractors/identifier_distance.py @@ -0,0 +1,53 @@ +from typing import Iterable, Tuple + +import bblfsh + +from sourced.ml.core.algorithms.token_parser import NoopTokenParser +from sourced.ml.core.algorithms.uast.id_distance import ( + Uast2IdDistance, + Uast2IdLineDistance, + Uast2IdTreeDistance, +) +from sourced.ml.core.extractors.bags_extractor import BagsExtractor + + +class IdentifierDistance(BagsExtractor): + """ + Extractor wrapper for Uast2IdTreeDistance and Uast2IdLineDistance algorithm. + Note that this is an unusual BagsExtractor since it returns iterable instead of bag. + + The class did not wrap with @register_extractor because it does not produce bags as others do. + So nobody outside code will see it or use it directly. + For the same reason we a free to override NAMESPACE, NAME, OPTS fields with any value we want. + + TODO(zurk): Split BagsExtractor into two clases: Extractor and BagsExtractor(Extractor), + re-inherit this class from Extractor, delete explanations from docstring. 
+ """ + + NAMESPACE = "" + NAME = "Identifier distance" + OPTS = {} + DEFAULT_MAX_DISTANCE = Uast2IdDistance.DEFAULT_MAX_DISTANCE + + class DistanceType: + Tree = "tree" + Line = "line" + All = {Tree, Line} + + @staticmethod + def resolve(type): + if type == IdentifierDistance.DistanceType.Line: + return Uast2IdLineDistance + if type == IdentifierDistance.DistanceType.Tree: + return Uast2IdTreeDistance + raise ValueError("Unknown distance type: %s" % type) + + def __init__(self, split_stem=False, type="tree", max_distance=DEFAULT_MAX_DISTANCE, **kwargs): + super().__init__(**kwargs) + Uast2IdDistance = self.DistanceType.resolve(type) + self.uast2id_distance = Uast2IdDistance( + token_parser=NoopTokenParser() if not split_stem else None, max_distance=max_distance + ) + + def extract(self, uast: bblfsh.Node) -> Iterable[Tuple[str, str, int]]: + yield from self.uast2id_distance(uast) diff --git a/sourced/ml/core/extractors/identifiers.py b/sourced/ml/core/extractors/identifiers.py new file mode 100644 index 0000000..9bd4764 --- /dev/null +++ b/sourced/ml/core/extractors/identifiers.py @@ -0,0 +1,19 @@ +from sourced.ml.core.algorithms.token_parser import NoopTokenParser +from sourced.ml.core.algorithms.uast.ids_to_bag import UastIds2Bag +from sourced.ml.core.extractors.bags_extractor import BagsExtractor +from sourced.ml.core.extractors.helpers import register_extractor + + +@register_extractor +class IdentifiersBagExtractor(BagsExtractor): + NAME = "id" + NAMESPACE = "i." + OPTS = {"split-stem": True} + OPTS.update(BagsExtractor.OPTS) + + def __init__(self, docfreq_threshold=None, split_stem=True, **kwargs): + super().__init__(docfreq_threshold, **kwargs) + self.id2bag = UastIds2Bag(None, NoopTokenParser() if not split_stem else None) + + def uast_to_bag(self, uast): + return self.id2bag(uast) diff --git a/sourced/ml/core/extractors/literals.py b/sourced/ml/core/extractors/literals.py new file mode 100644 index 0000000..6e8052f --- /dev/null +++ b/sourced/ml/core/extractors/literals.py @@ -0,0 +1,69 @@ +import codecs +from collections import defaultdict +import os + +# TODO (Guillemdb): fix imports +from sourced.ml.core.algorithms.uast.ids_to_bag import uast2sequence, UastIds2Bag +from sourced.ml.core.extractors.bags_extractor import BagsExtractor +from sourced.ml.core.extractors.helpers import register_extractor +from sourced.ml.core.utils import bblfsh_roles + + +class HashedTokenParser: + def process_token(self, token): + yield codecs.encode( + (hash(token) & 0xFFFFFFFFFFFFFFFF).to_bytes(8, "little"), "hex_codec" + ).decode() + + +class Literals2Bag(UastIds2Bag): + """ + Converts a UAST to a bag-of-literals. + """ + + XPATH = "//*[@roleLiteral]" + + def __init__(self, token2index=None, token_parser=None): + """ + :param token2index: The mapping from tokens to bag keys. If None, no mapping is performed. + :param token_parser: Specify token parser if you want to use a custom one. \ + :class:'TokenParser' is used if it is not specified. + """ + token_parser = HashedTokenParser() if token_parser is None else token_parser + super().__init__(token2index, token_parser) + + def __call__(self, uast): + """ + HOTFIX for https://github.com/bblfsh/client-python/issues/92 + Converts a UAST to a weighed bag-of-literals. The weights are literals frequencies. + The tokens are preprocessed by _token_parser. + Overwrite __call__ to avoid issues with `bblfsh.filter`. + + :param uast: The UAST root node. 
+ :return: bag + """ + nodes = [node for node in uast2sequence(uast) if bblfsh_roles.LITERAL in node.roles] + bag = defaultdict(int) + for node in nodes: + for sub in self._token_parser.process_token(node.token): + try: + bag[self._token2index[sub]] += 1 + except KeyError: + continue + return bag + + +@register_extractor +class LiteralsBagExtractor(BagsExtractor): + NAME = "lit" + NAMESPACE = "l." + OPTS = BagsExtractor.OPTS.copy() + + def __init__(self, docfreq_threshold=None, **kwargs): + super().__init__(docfreq_threshold, **kwargs) + self.id2bag = Literals2Bag(None, HashedTokenParser()) + + def uast_to_bag(self, uast): + if os.getenv("PYTHONHASHSEED", "random") == "random": + raise RuntimeError("PYTHONHASHSEED must be set") + return self.id2bag(uast) diff --git a/sourced/ml/core/extractors/uast_random_walk.py b/sourced/ml/core/extractors/uast_random_walk.py new file mode 100644 index 0000000..1ea76eb --- /dev/null +++ b/sourced/ml/core/extractors/uast_random_walk.py @@ -0,0 +1,28 @@ +# TODO (Guillemdb): fix imports +from sourced.ml.core.algorithms.uast.struct_to_bag import UastRandomWalk2Bag +from sourced.ml.core.extractors.bags_extractor import BagsExtractor +from sourced.ml.core.extractors.helpers import ( + filter_kwargs, + get_names_from_kwargs, + register_extractor, +) + + +@register_extractor +class UastRandomWalkBagExtractor(BagsExtractor): + NAME = "node2vec" + NAMESPACE = "r." + OPTS = dict(get_names_from_kwargs(UastRandomWalk2Bag.__init__)) + OPTS.update(BagsExtractor.OPTS) + + def __init__(self, docfreq_threshold=None, **kwargs): + original_kwargs = kwargs + uast2bag_kwargs = filter_kwargs(kwargs, UastRandomWalk2Bag.__init__) + for k in uast2bag_kwargs: + kwargs.pop(k) + super().__init__(docfreq_threshold, **kwargs) + self._log.debug("__init__ %s", original_kwargs) + self.uast2bag = UastRandomWalk2Bag(**uast2bag_kwargs) + + def uast_to_bag(self, uast): + return self.uast2bag(uast) diff --git a/sourced/ml/core/extractors/uast_seq.py b/sourced/ml/core/extractors/uast_seq.py new file mode 100644 index 0000000..bcdc2cc --- /dev/null +++ b/sourced/ml/core/extractors/uast_seq.py @@ -0,0 +1,28 @@ +# TODO (Guillemdb): fix imports +from sourced.ml.core.algorithms.uast.struct_to_bag import UastSeq2Bag +from sourced.ml.core.extractors.bags_extractor import BagsExtractor +from sourced.ml.core.extractors.helpers import ( + filter_kwargs, + get_names_from_kwargs, + register_extractor, +) + + +@register_extractor +class UastSeqBagExtractor(BagsExtractor): + NAME = "uast2seq" + NAMESPACE = "s." 
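    # Bag keys are stride-windowed DFS sequences of internal node types joined by ">",
    # e.g. an (illustrative) window of length 3 becomes the single key
    # "s.If>Block>Return".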
+ OPTS = dict(get_names_from_kwargs(UastSeq2Bag.__init__)) + OPTS.update(BagsExtractor.OPTS) + + def __init__(self, docfreq_threshold=None, **kwargs): + original_kwargs = kwargs + uast2bag_kwargs = filter_kwargs(kwargs, UastSeq2Bag.__init__) + for k in uast2bag_kwargs: + kwargs.pop(k) + super().__init__(docfreq_threshold, **kwargs) + self._log.debug("__init__ %s", original_kwargs) + self.uast2bag = UastSeq2Bag(**uast2bag_kwargs) + + def uast_to_bag(self, uast): + return self.uast2bag(uast) diff --git a/sourced/ml/core/modelforgecfg.py b/sourced/ml/core/modelforgecfg.py new file mode 100644 index 0000000..5148c1f --- /dev/null +++ b/sourced/ml/core/modelforgecfg.py @@ -0,0 +1,8 @@ +import os + + +VENDOR = "source{d}" +BACKEND = "gcs" +BACKEND_ARGS = "bucket=models.cdn.sourced.tech" +INDEX_REPO = "https://github.com/src-d/models" +CACHE_DIR = os.path.join(os.path.expanduser("~"), ".cache", "source{d}") diff --git a/sourced/ml/core/models/__init__.py b/sourced/ml/core/models/__init__.py new file mode 100644 index 0000000..5edb673 --- /dev/null +++ b/sourced/ml/core/models/__init__.py @@ -0,0 +1,12 @@ +# flake8: noqa +from sourced.ml.core.models.bow import BOW +from sourced.ml.core.models.coocc import Cooccurrences +from sourced.ml.core.models.df import DocumentFrequencies +from sourced.ml.core.models.ordered_df import OrderedDocumentFrequencies +from sourced.ml.core.models.id2vec import Id2Vec +from sourced.ml.core.models.tensorflow import TensorFlowModel +from sourced.ml.core.models.topics import Topics +from sourced.ml.core.models.quant import QuantizationLevels + +from sourced.ml.core.models.model_converters.merge_df import MergeDocFreq +from sourced.ml.core.models.model_converters.merge_bow import MergeBOW diff --git a/sourced/ml/core/models/bow.py b/sourced/ml/core/models/bow.py new file mode 100644 index 0000000..2d0a18f --- /dev/null +++ b/sourced/ml/core/models/bow.py @@ -0,0 +1,148 @@ +import logging +from typing import Dict, Iterable, List + +from modelforge import ( + assemble_sparse_matrix, + disassemble_sparse_matrix, + merge_strings, + Model, + register_model, + split_strings, +) +from modelforge.progress_bar import progress_bar +from scipy import sparse + +from sourced.ml.core.models.df import DocumentFrequencies +from sourced.ml.core.models.license import DEFAULT_LICENSE + + +@register_model +class BOW(Model): + """ + Weighted bag of words model. Every word is correspond to an index and its matrix column. + Bag is a word set from repository, file or anything else. + Word is source code identifier or its part. + This model depends on :class:`core.models.DocumentFrequencies`. + """ + + NAME = "bow" + VENDOR = "source{d}" + DESCRIPTION = "Model that contains source code as weighted bag of words." + LICENSE = DEFAULT_LICENSE + + @property + def matrix(self) -> sparse.spmatrix: + """ + Returns the bags as a sparse matrix. Rows are documents and columns are tokens weight. + """ + return self._matrix + + @property + def documents(self): + """ + The list of documents in the model. + """ + return self._documents + + @property + def tokens(self): + """ + The list of tokens in the model. + """ + return self._tokens + + def __getitem__(self, item: int): + """ + Returns document name, word indices and weights for the given document index. + + :param item: Document index. 
+ :return: (name, :class:`numpy.ndarray` with word indices, \ + :class:`numpy.ndarray` with weights) + """ + data = self._matrix[item] + return self._documents[item], data.indices, data.data + + def __iter__(self): + """ + Returns an iterator over the document indices. + """ + return iter(range(len(self))) + + def __len__(self): + """ + Returns the number of documents. + """ + return len(self._documents) + + def construct(self, documents: List[str], tokens: List[str], matrix: sparse.spmatrix): + if matrix.shape[0] != len(documents): + raise ValueError( + "matrix shape mismatch, documents %d != %d" % (matrix.shape[0], len(documents)) + ) + if matrix.shape[1] != len(tokens): + raise ValueError( + "matrix shape mismatch, tokens %d != %d" % (matrix.shape[1], len(tokens)) + ) + self._documents = documents + self._matrix = matrix + self._tokens = tokens + return self + + def dump(self): + return ( + "Shape: %s\n" + "First 10 documents: %s\n" + "First 10 tokens: %s" % (self._matrix.shape, self._documents[:10], self.tokens[:10]) + ) + + def save( + self, output: str, series: str, deps: Iterable = tuple(), create_missing_dirs: bool = True + ): + if not deps: + try: + deps = [self.get_dep(DocumentFrequencies.NAME)] + except KeyError: + raise ValueError( + "You must specify DocumentFrequencies dependency to save BOW." + ) from None + super().save( + output=output, series=series, deps=deps, create_missing_dirs=create_missing_dirs + ) + + def convert_bow_to_vw(self, output: str): + log = logging.getLogger("bow2vw") + log.info("Writing %s", output) + with open(output, "w") as fout: + for index in progress_bar(self, log, expected_size=len(self)): + record = self[index] + fout.write(record[0].replace(":", "").replace(" ", "_") + " ") + pairs = [] + for t, v in zip(*record[1:]): + try: + word = self.tokens[t] + except (KeyError, IndexError): + log.warning("%d not found in the vocabulary", t) + continue + pairs.append("%s:%s" % (word, v)) + fout.write(" ".join(pairs)) + fout.write("\n") + + def documents_index(self) -> Dict[str, int]: + return {r: i for i, r in enumerate(self._documents)} + + def _generate_tree(self): + return { + "documents": merge_strings(self._documents), + "matrix": disassemble_sparse_matrix(self._matrix), + "tokens": merge_strings(self.tokens), + } + + def _load_tree_kwargs(self, tree: dict): + return { + "documents": split_strings(tree["documents"]), + "matrix": assemble_sparse_matrix(tree["matrix"]), + "tokens": split_strings(tree["tokens"]), + } + + def _load_tree(self, tree: dict): + self.construct(**self._load_tree_kwargs(tree)) diff --git a/sourced/ml/core/models/coocc.py b/sourced/ml/core/models/coocc.py new file mode 100644 index 0000000..f18a074 --- /dev/null +++ b/sourced/ml/core/models/coocc.py @@ -0,0 +1,75 @@ +from modelforge.model import ( + assemble_sparse_matrix, + disassemble_sparse_matrix, + merge_strings, + Model, + split_strings, +) +from modelforge.models import register_model + +from sourced.ml.core.models.license import DEFAULT_LICENSE + + +@register_model +class Cooccurrences(Model): + """ + Co-occurrence matrix. + """ + + NAME = "co-occurrences" + VENDOR = "source{d}" + DESCRIPTION = "Model that contains the sparse co-occurrence matrix of source code identifiers." 
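    # matrix[i, j] holds how many times tokens[i] and tokens[j] were observed together;
    # the tokens list maps matrix indices back to identifiers.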
+ LICENSE = DEFAULT_LICENSE + + def construct(self, tokens, matrix): + self._tokens = tokens + self._matrix = matrix + return self + + def _load_tree(self, tree): + self.construct( + tokens=split_strings(tree["tokens"]), matrix=assemble_sparse_matrix(tree["matrix"]) + ) + + def dump(self): + return """Number of words: %d +First 10 words: %s +Matrix: shape: %s non-zero: %d""" % ( + len(self.tokens), + self.tokens[:10], + self.matrix.shape, + self.matrix.getnnz(), + ) + + @property + def tokens(self): + """ + Returns the tokens in the order which corresponds to the matrix's rows and cols. + """ + return self._tokens + + @property + def matrix(self): + """ + Returns the sparse co-occurrence matrix. + """ + return self._matrix + + def __len__(self): + """ + Returns the number of tokens in the model. + """ + return len(self._tokens) + + def _generate_tree(self): + return { + "tokens": merge_strings(self.tokens), + "matrix": disassemble_sparse_matrix(self.matrix), + } + + def matrix_to_rdd(self, spark_context: "pyspark.SparkContext") -> "pyspark.RDD": + self._log.info("Convert coocc model to RDD...") + rdd_row = spark_context.parallelize(self._matrix.row) + rdd_col = spark_context.parallelize(self._matrix.col) + rdd_data = spark_context.parallelize(self._matrix.data) + return rdd_row.zip(rdd_col).zip(rdd_data) diff --git a/sourced/ml/core/models/df.py b/sourced/ml/core/models/df.py new file mode 100644 index 0000000..8908d11 --- /dev/null +++ b/sourced/ml/core/models/df.py @@ -0,0 +1,175 @@ +from itertools import islice +from typing import Dict, Iterable, List, Union + +from modelforge import merge_strings, Model, register_model, split_strings +import numpy + +from sourced.ml.core.models.license import DEFAULT_LICENSE + +# TODO(guillemdb) method ordering. prop, dunder, public, protected, private + + +@register_model +class DocumentFrequencies(Model): + """ + Document frequencies - number of times a source code identifier appeared + in different repositories. Each repository counts only once. + """ + + NAME = "docfreq" + VENDOR = "source{d}" + DESCRIPTION = "Model that contains document frequencies of features extracted from code." + LICENSE = DEFAULT_LICENSE + + def construct(self, docs: int, tokfreqs: Union[Iterable[Dict[str, int]], Dict[str, int]]): + """ + Initializes this model. + :param docs: The number of documents. + :param tokfreqs: The dictionary of token -> frequency or the iterable collection of such + dictionaries. + :return: self + """ + if isinstance(tokfreqs, dict): + df = tokfreqs + else: + df = {} + for d in tokfreqs: + df.update(d) + self._docs = docs + self._df = df + return self + + """ + WE DO NOT ADD THIS + + def df(self) -> dict: + """ + + def _load_tree(self, tree: dict, tokens=None): + if tokens is None: + tokens = split_strings(tree["tokens"]) + freqs = tree["freqs"] + self._log.info("Building the docfreq dictionary...") + tokfreq = dict(zip(tokens, freqs)) + self.construct(docs=tree["docs"], tokfreqs=tokfreq) + + def _generate_tree(self): + tokens = self.tokens() + freqs = numpy.array([self._df[t] for t in tokens], dtype=numpy.float32) + return {"docs": self.docs, "tokens": merge_strings(tokens), "freqs": freqs} + + def dump(self): + return """Number of words: %d +Random 10 words: %s +Number of documents: %d""" % ( + len(self._df), + dict(islice(self._df.items(), 10)), + self.docs, + ) + + @property + def docs(self) -> int: + """ + Returns the number of documents. 
+ """ + return self._docs + + """ + WE DO NOT ADD THIS + + def df(self) -> dict: + """ + + def prune(self, threshold: int) -> "DocumentFrequencies": + """ + Removes tokens which occur less than `threshold` times. + The operation happens *not* in-place - a new model is returned. + :param threshold: Minimum number of occurrences. + :return: The new model if the current one had to be changed, otherwise self. + """ + if threshold < 1: + raise ValueError("Invalid threshold: %d" % threshold) + if threshold == 1: + return self + self._log.info("Pruning to min %d occurrences", threshold) + pruned = type(self)() + pruned._docs = self.docs + pruned._df = {k: v for k, v in self._df.items() if v >= threshold} + self._log.info("Size: %d -> %d", len(self), len(pruned)) + pruned._meta = self.meta + return pruned + + def greatest(self, max_size: int) -> "DocumentFrequencies": + """ + Truncates the model to most frequent `max_size` tokens. + The operation happens *not* in-place - a new model is returned. + :param max_size: The maximum vocabulary size. + :return: The new model if the current one had to be changed, otherwise self. + """ + if max_size < 1: + raise ValueError("Invalid max_size: %d" % max_size) + if len(self) <= max_size: + return self + self._log.info("Pruning to max %d size", max_size) + pruned = type(self)() + pruned._docs = self.docs + freqs = numpy.fromiter(self._df.values(), dtype=numpy.int32, count=len(self)) + keys = numpy.array(list(self._df.keys()), dtype=object) + chosen = numpy.argpartition(freqs, len(freqs) - max_size)[len(freqs) - max_size :] + border_freq = freqs[chosen].min() + chosen = freqs >= border_freq + # argpartition can leave some of the elements with freq == border_freq outside + # so next step ensures that we include everything. + freqs = freqs[chosen] + keys = keys[chosen] + # we need to be deterministic at the cutoff frequency + # argpartition returns random samples every time + # so we treat words with the cutoff frequency separately + if max_size != freqs.shape[0]: + assert max_size < freqs.shape[0] + border_freq_indexes = freqs == border_freq + border_keys = keys[border_freq_indexes] + border_keys.sort() + border_keys = border_keys[: max_size - freqs.shape[0]] + df = dict(zip(keys[~border_freq_indexes], freqs[~border_freq_indexes])) + df.update({key: border_freq for key in border_keys}) + else: + df = dict(zip(keys, freqs)) + pruned._df = df + self._log.info("Size: %d -> %d", len(self), len(pruned)) + pruned._meta = self.meta + return pruned + + def __getitem__(self, item): + return self._df[item] + + def __iter__(self): + return iter(self._df.items()) + + def __len__(self): + """ + Returns the number of tokens in the model. + """ + return len(self._df) + + def get(self, item, default=None) -> Union[int, None]: + """ + Return the document frequency for a given token. + + :param item: The token to query. + :param default: Returned value in case the token is missing. + :return: int or `default` + """ + return self._df.get(item, default) + + def tokens(self) -> List[str]: + """ + Returns the list of tokens. 
+ """ + return list(self._df) + + """ + WE DO NOT ADD THIS + + def df(self) -> dict: + """ diff --git a/sourced/ml/core/models/id2vec.py b/sourced/ml/core/models/id2vec.py new file mode 100644 index 0000000..368c8b4 --- /dev/null +++ b/sourced/ml/core/models/id2vec.py @@ -0,0 +1,68 @@ +from modelforge import merge_strings, Model, register_model, split_strings + +from sourced.ml.core.models.license import DEFAULT_LICENSE + + +@register_model +class Id2Vec(Model): + """ + id2vec model - source code identifier embeddings. + """ + + NAME = "id2vec" + VENDOR = "source{d}" + DESCRIPTION = "Model that contains information on source code as identifier embeddings." + LICENSE = DEFAULT_LICENSE + + def construct(self, embeddings, tokens): + self._embeddings = embeddings + self._tokens = tokens + self._log.info("Building the token index...") + self._token2index = {w: i for i, w in enumerate(self._tokens)} + return self + + def _load_tree(self, tree): + self.construct(embeddings=tree["embeddings"].copy(), tokens=split_strings(tree["tokens"])) + + def dump(self): + return """Shape: %s +First 10 words: %s""" % ( + self.embeddings.shape, + self.tokens[:10], + ) + + @property + def embeddings(self): + """ + :class:`numpy.ndarray` with the embeddings of shape + (N tokens x embedding dims). + """ + return self._embeddings + + @property + def tokens(self): + """ + List with the processed source code identifiers. + """ + return self._tokens + + def items(self): + """ + Returns the tuples belonging to token -> index mapping. + """ + return self._token2index.items() + + def __getitem__(self, item): + """ + Returns the index of the specified processed source code identifier. + """ + return self._token2index[item] + + def __len__(self): + """ + Returns the number of tokens in the model. + """ + return len(self._tokens) + + def _generate_tree(self): + return {"embeddings": self.embeddings, "tokens": merge_strings(self.tokens)} diff --git a/sourced/ml/core/models/license.py b/sourced/ml/core/models/license.py new file mode 100644 index 0000000..9ebe479 --- /dev/null +++ b/sourced/ml/core/models/license.py @@ -0,0 +1,3 @@ +"""Default license used for the models.""" + +DEFAULT_LICENSE = "ODbL-1.0" diff --git a/sourced/ml/core/models/model_converters/__init__.py b/sourced/ml/core/models/model_converters/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sourced/ml/core/models/model_converters/base.py b/sourced/ml/core/models/model_converters/base.py new file mode 100644 index 0000000..66d119e --- /dev/null +++ b/sourced/ml/core/models/model_converters/base.py @@ -0,0 +1,125 @@ +import logging +import multiprocessing +import os +from typing import List, Union + +from modelforge import Model +from modelforge.progress_bar import progress_bar + +from sourced.ml.core.utils.pickleable_logger import PickleableLogger + + +class Model2Base(PickleableLogger): + """ + Base class for model -> model conversions. + """ + + MODEL_FROM_CLASS = None + MODEL_TO_CLASS = None + + def __init__( + self, + num_processes: int = 0, + log_level: int = logging.DEBUG, + overwrite_existing: bool = True, + ): + """ + Initializes a new instance of Model2Base class. + + :param num_processes: The number of processes to execute for conversion. + :param log_level: Logging verbosity level. + :param overwrite_existing: Rewrite existing models or skip them. 
+ """ + super().__init__(log_level=log_level) + self.num_processes = multiprocessing.cpu_count() if num_processes == 0 else num_processes + self.overwrite_existing = overwrite_existing + + def convert(self, models_path: List[str], destdir: str) -> int: + """ + Performs the model -> model conversion. Runs the conversions in a pool of processes. + + :param models_path: List of Models path. + :param destdir: The directory where to store the models. The directory structure is \ + preserved. + :return: The number of converted files. + """ + files = list(models_path) + self._log.info("Found %d files", len(files)) + if not files: + return 0 + queue_in = multiprocessing.Manager().Queue() + queue_out = multiprocessing.Manager().Queue(1) + processes = [ + multiprocessing.Process( + target=self._process_entry, args=(i, destdir, queue_in, queue_out) + ) + for i in range(self.num_processes) + ] + for p in processes: + p.start() + for f in files: + queue_in.put(f) + for _ in processes: + queue_in.put(None) + failures = 0 + for _ in progress_bar(files, self._log, expected_size=len(files)): + filename, ok = queue_out.get() + if not ok: + failures += 1 + for p in processes: + p.join() + self._log.info("Finished, %d failed files", failures) + return len(files) - failures + + def convert_model(self, model: Model) -> Union[Model, None]: + """ + This must be implemented in the child classes. + + :param model: The model instance to convert. + :return: The converted model instance or None if it is not needed. + """ + raise NotImplementedError + + def finalize(self, index: int, destdir: str): + """ + Called for each worker in the end of the processing. + + :param index: Worker's index. + :param destdir: The directory where to store the models. + """ + pass + + def _process_entry(self, index, destdir, queue_in, queue_out): + while True: + filepath = queue_in.get() + if filepath is None: + break + try: + model_path = os.path.join(destdir, os.path.split(filepath)[1]) + if os.path.exists(model_path): + if self.overwrite_existing: + self._log.warning( + "Model %s already exists, but will be overwrite. If you want to " + "skip existing models use --disable-overwrite flag", + model_path, + ) + else: + self._log.warning("Model %s already exists, skipping.", model_path) + queue_out.put((filepath, True)) + continue + model_from = self.MODEL_FROM_CLASS(log_level=self._log.level).load(filepath) + model_to = self.convert_model(model_from) + if model_to is not None: + dirs = os.path.dirname(model_path) + if dirs: + os.makedirs(dirs, exist_ok=True) + model_to.save(model_path, deps=model_to.meta["dependencies"]) + except: # noqa + self._log.exception("%s failed", filepath) + queue_out.put((filepath, False)) + else: + queue_out.put((filepath, True)) + self.finalize(index, destdir) + + def _get_log_name(self): + return "%s2%s" % (self.MODEL_FROM_CLASS.NAME, self.MODEL_TO_CLASS.NAME) diff --git a/sourced/ml/core/models/model_converters/merge_bow.py b/sourced/ml/core/models/model_converters/merge_bow.py new file mode 100644 index 0000000..c7564e5 --- /dev/null +++ b/sourced/ml/core/models/model_converters/merge_bow.py @@ -0,0 +1,71 @@ +import os + +from scipy.sparse import vstack + +from sourced.ml.core import extractors +from sourced.ml.core.models.bow import BOW +from sourced.ml.core.models.model_converters.base import Model2Base + + +class MergeBOW(Model2Base): + """ + Merges several :class:`BOW` models together. 
+ """ + + MODEL_FROM_CLASS = BOW + MODEL_TO_CLASS = BOW + + def __init__(self, features=None, *args, **kwargs): + super().__init__(num_processes=1, *args, **kwargs) + self.documents = None + self.tokens = None + self.matrix = None + self.deps = None + self.features_namespaces = None + if features: + self.features_namespaces = [ + ex.NAMESPACE for ex in extractors.__extractors__.values() if ex.NAME in features + ] + + def convert_model(self, model: BOW) -> None: + if self.tokens is None: + self.tokens = model.tokens + self.documents = model.documents + self.matrix = [model.matrix.tocsr()] + self.deps = model._meta["dependencies"] + elif set(self.tokens) != set(model.tokens): + raise ValueError("Models don't share the same set of tokens !") + else: + self.documents += model.documents + self.matrix.append(model.matrix.tocsr()) + + def finalize(self, index: int, destdir: str): + self._log.info("Stacking matrices ...") + matrix = self.matrix.pop(0) + while self.matrix: + matrix = vstack([matrix, self.matrix.pop(0)]) + self._log.info("%s matrices to stack ...", len(self.matrix)) + self.matrix = matrix + self._log.info("Writing model ...") + if self.features_namespaces: + self._reduce_matrix() + BOW(log_level=self._log.level).construct(self.documents, self.tokens, self.matrix).save( + output=self._save_path(index, destdir), series="id2vec", deps=self.deps + ) + + def _reduce_matrix(self): + reduced_tokens = [] + columns = [] + matrix = self.matrix.tocsc() + for i, token in enumerate(self.tokens): + if token.split(".")[0] in self.features_namespaces: + reduced_tokens.append(token) + columns.append(i) + self.tokens = reduced_tokens + self.matrix = matrix[:, columns] + + @staticmethod + def _save_path(index: int, destdir: str): + if destdir.endswith(".asdf"): + return destdir + return os.path.join(destdir, "bow_%d.asdf" % index) diff --git a/sourced/ml/core/models/model_converters/merge_df.py b/sourced/ml/core/models/model_converters/merge_df.py new file mode 100644 index 0000000..2323214 --- /dev/null +++ b/sourced/ml/core/models/model_converters/merge_df.py @@ -0,0 +1,44 @@ +from collections import defaultdict +import os + +from sourced.ml.core.models.df import DocumentFrequencies +from sourced.ml.core.models.model_converters.base import Model2Base +from sourced.ml.core.models.ordered_df import OrderedDocumentFrequencies + + +class MergeDocFreq(Model2Base): + """ + Merges several :class:`DocumentFrequencies` models together. 
+ """ + + MODEL_FROM_CLASS = DocumentFrequencies + MODEL_TO_CLASS = DocumentFrequencies + + def __init__( + self, min_docfreq: int, vocabulary_size: int, ordered: bool = False, *args, **kwargs + ): + super().__init__(num_processes=1, *args, **kwargs) + self.ordered = ordered + self.min_docfreq = min_docfreq + self.vocabulary_size = vocabulary_size + self._df = defaultdict(int) + self._docs = 0 + + def convert_model(self, model: DocumentFrequencies) -> None: + for word, freq in model: + self._df[word] += freq + self._docs += model.docs + + def finalize(self, index: int, destdir: str): + df_model = OrderedDocumentFrequencies if self.ordered else DocumentFrequencies + df_model(log_level=self._log.level).construct(self._docs, self._df).prune( + self.min_docfreq + ).greatest(self.vocabulary_size).save( + output=self._save_path(index, destdir), series="id2vec" + ) + + @staticmethod + def _save_path(index: int, destdir: str): + if destdir.endswith(".asdf"): + return destdir + return os.path.join(destdir, "docfreq_%d.asdf" % index) diff --git a/sourced/ml/core/models/ordered_df.py b/sourced/ml/core/models/ordered_df.py new file mode 100644 index 0000000..63c8647 --- /dev/null +++ b/sourced/ml/core/models/ordered_df.py @@ -0,0 +1,61 @@ +from typing import Dict, Iterable, List + +from modelforge import merge_strings, register_model, split_strings +import numpy + +from sourced.ml.core.models import DocumentFrequencies + + +@register_model +class OrderedDocumentFrequencies(DocumentFrequencies): + """ + Compatible with the original DocumentFrequencies. This model maintains the determinitic + sequence of the tokens. + """ + + # NAME is the same + + def construct(self, docs: int, tokfreqs: Iterable[Dict[str, int]]): + super().construct(docs, tokfreqs) + self._log.info("Ordering the keys...") + keys = sorted(self._df) + self._order = {k: i for i, k in enumerate(keys)} + return self + + @property + def order(self) -> Dict[str, int]: + return self._order + + def tokens(self) -> List[str]: + arr = [None for _ in range(len(self))] + for k, v in self.order.items(): + arr[v] = k + return arr + + def _load_tree(self, tree): + tokens = split_strings(tree["tokens"]) + super()._load_tree(tree, tokens) + self._log.info("Mapping the keys order...") + self._order = {k: i for i, k in enumerate(tokens)} + + def _generate_tree(self): + tokens = [None] * len(self) + freqs = numpy.zeros(len(self), dtype=numpy.float32) + for k, i in self._order.items(): + tokens[i] = k + freqs[i] = self._df[k] + return {"docs": self.docs, "tokens": merge_strings(tokens), "freqs": freqs} + + def prune(self, threshold: int) -> "OrderedDocumentFrequencies": + pruned = super().prune(threshold) + if pruned is not self: + self._log.info("Recovering the order...") + pruned._order = {k: i for i, k in enumerate(sorted(pruned._df))} + return pruned + + def greatest(self, max_size: int) -> "OrderedDocumentFrequencies": + pruned = super().greatest(max_size) + if pruned is not self: + self._log.info("Recovering the order...") + pruned._order = {k: i for i, k in enumerate(sorted(pruned._df))} + return pruned diff --git a/sourced/ml/core/models/quant.py b/sourced/ml/core/models/quant.py new file mode 100644 index 0000000..a213377 --- /dev/null +++ b/sourced/ml/core/models/quant.py @@ -0,0 +1,66 @@ +from typing import Dict + +from modelforge import merge_strings, Model, register_model, split_strings +import numpy + +from sourced.ml.core.models.license import DEFAULT_LICENSE + + +@register_model +class QuantizationLevels(Model): + """ + This model contains 
quantization levels for multiple schemes (feature types). + Every feature "class" (type, possible distinct value) corresponds to the numpy array + with integer level borders. The size of each numpy array is (the number of levels + 1). + """ + + NAME = "quant" + VENDOR = "source{d}" + DESCRIPTION = "Model that contains quantization levels for multiple schemes (feature types)." + LICENSE = DEFAULT_LICENSE + + def construct(self, levels: Dict[str, Dict[str, numpy.ndarray]]): + self._levels = levels + return self + + @property + def levels(self) -> Dict[str, Dict[str, numpy.ndarray]]: + return self._levels + + def __len__(self): + return len(self.levels) + + def _load_tree(self, tree): + self._levels = {} + for key, vals in tree["schemes"].items(): + classes = split_strings(vals["classes"]) + levels = vals["levels"] + self.levels[key] = dict(zip(classes, numpy.split(levels, len(classes)))) + + def _generate_tree(self): + tree = {"schemes": {}} + for key, vals in self.levels.items(): + tree["schemes"][key] = scheme = {} + npartitions = len(next(iter(vals.values()))) + classes = [None for _ in range(len(vals))] + scheme["levels"] = levels = numpy.zeros(len(vals) * npartitions, dtype=numpy.int32) + for i, pair in enumerate(vals.items()): + classes[i], levels[i * npartitions : (i + 1) * npartitions] = pair + scheme["classes"] = merge_strings(classes) + return tree + + def dump(self): + return """Schemes: %s""" % ( + sorted( + (v[0], "%d@%d" % (len(v[1]), len(next(iter(v[1].values()))) - 1)) + for v in self.levels.items() + ) + ) + + def apply_quantization(self, extractors): + for extractor in extractors: + try: + extractor.quantize + except AttributeError: + continue + extractor.uast_to_bag.levels = self._levels[extractor.NAME] diff --git a/sourced/ml/core/models/tensorflow.py b/sourced/ml/core/models/tensorflow.py new file mode 100644 index 0000000..9a3e185 --- /dev/null +++ b/sourced/ml/core/models/tensorflow.py @@ -0,0 +1,55 @@ +from typing import List + +from modelforge import Model, register_model +import numpy + +from sourced.ml.core.models.license import DEFAULT_LICENSE + + +@register_model +class TensorFlowModel(Model): + """ + TensorFlow Protobuf model exported in the Modelforge format with GraphDef inside. + """ + + NAME = "tensorflow-model" + VENDOR = "source{d}" + DESCRIPTION = "TensorFlow Protobuf model that contains a GraphDef instance." + LICENSE = DEFAULT_LICENSE + + def construct( + self, + graphdef: "tensorflow.GraphDef" = None, # noqa: F821 + session: "tensorflow.Session" = None, # noqa: F821 + outputs: List[str] = None, + ): + if graphdef is None: + assert session is not None + assert outputs is not None + graphdef = session.graph_def + from tensorflow.python.framework import graph_util + + for node in graphdef.node: + node.device = "" + graphdef = graph_util.convert_variables_to_constants(session, graphdef, outputs) + self._graphdef = graphdef + return self + + @property + def graphdef(self): + """ + Returns the wrapped TensorFlow GraphDef. 
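# --- Editor's note: illustrative sketch, not part of this diff. It shows the nested mapping
# --- expected by QuantizationLevels.construct() above: scheme -> class -> level borders.
# --- The scheme/class names and border values are invented; every class of one scheme must
# --- carry the same number of borders (number of levels + 1).
import numpy

from sourced.ml.core.models.quant import QuantizationLevels

levels = {
    "children": {
        "if": numpy.array([0, 1, 3, 7], dtype=numpy.int32),    # 4 borders -> 3 levels
        "for": numpy.array([0, 2, 4, 9], dtype=numpy.int32),
    },
}
quant = QuantizationLevels().construct(levels)
print(len(quant), quant.dump())   # 1 scheme; dump() reports "2@3": 2 classes with 3 levels each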
+ """ + return self._graphdef + + def _generate_tree(self) -> dict: + return { + "graphdef": numpy.frombuffer(self._graphdef.SerializeToString(), dtype=numpy.uint8) + } + + def _load_tree(self, tree: dict): + from tensorflow.core.framework import graph_pb2 + + graphdef = graph_pb2.GraphDef() + graphdef.ParseFromString(tree["graphdef"].data) + self.construct(graphdef=graphdef) diff --git a/sourced/ml/core/models/topics.py b/sourced/ml/core/models/topics.py new file mode 100644 index 0000000..c463dda --- /dev/null +++ b/sourced/ml/core/models/topics.py @@ -0,0 +1,99 @@ +from typing import Union + +from modelforge import ( + assemble_sparse_matrix, + disassemble_sparse_matrix, + merge_strings, + Model, + register_model, + split_strings, +) + +from sourced.ml.core.models.license import DEFAULT_LICENSE + + +@register_model +class Topics(Model): + NAME = "topics" + VENDOR = "source{d}" + DESCRIPTION = "Model that is used to identify topics of source code repositories." + LICENSE = DEFAULT_LICENSE + + @property + def tokens(self): + return self._tokens + + @property + def topics(self): + """ + May be None if no topics are labeled. + """ + return self._topics + + @property + def matrix(self): + """ + Rows: tokens + Columns: topics + """ + return self._matrix + + def construct(self, tokens: list, topics: Union[list, None], matrix): + if len(tokens) != matrix.shape[1]: + raise ValueError("Tokens and matrix do not match.") + self._tokens = tokens + self._topics = topics + self._matrix = matrix + return self + + def _load_tree(self, tree: dict) -> None: + self.construct( + split_strings(tree["tokens"]), + split_strings(tree["topics"]) if tree["topics"] else None, + assemble_sparse_matrix(tree["matrix"]), + ) + + def dump(self) -> str: + res = "%d topics, %d tokens\nFirst 10 tokens: %s\nTopics: " % ( + self.matrix.shape + (self.tokens[:10],) + ) + if self.topics is not None: + res += "labeled, first 10: %s\n" % self.topics[:10] + else: + res += "unlabeled\n" + nnz = self.matrix.getnnz() + res += "non-zero elements: %d (%f)" % ( + nnz, + nnz / (self.matrix.shape[0] * self.matrix.shape[1]), + ) + return res + + def _generate_tree(self): + return { + "tokens": merge_strings(self.tokens), + "topics": merge_strings(self.topics) if self.topics is not None else False, + "matrix": disassemble_sparse_matrix(self.matrix), + } + + def __len__(self): + """ + Returns the number of topics. + """ + return self.matrix.shape[0] + + def __getitem__(self, item): + """ + Returns the keywords sorted by significance from topic index. 
+ """ + row = self.matrix[item] + nnz = row.nonzero()[1] + pairs = [(-row[0, i], i) for i in nnz] + pairs.sort() + return [(self.tokens[pair[1]], -pair[0]) for pair in pairs] + + def label_topics(self, labels): + if len(labels) != len(self): + raise ValueError("Sizes do not match: %d != %d" % (len(labels), len(self))) + if not isinstance(labels[0], str): + raise TypeError("Labels must be strings") + self._topics = list(labels) diff --git a/sourced/ml/core/tests/__init__.py b/sourced/ml/core/tests/__init__.py new file mode 100644 index 0000000..d017800 --- /dev/null +++ b/sourced/ml/core/tests/__init__.py @@ -0,0 +1,26 @@ +import sys + +from modelforge import slogging + + +utmain = sys.modules["__main__"] +if utmain.__package__ == "unittest" and utmain.__spec__ is None: + from collections import namedtuple + + ModuleSpec = namedtuple("ModuleSpec", ["name"]) + utmain.__spec__ = ModuleSpec("unittest.__main__") + del ModuleSpec +del utmain + + +def has_tensorflow(): + try: + import tensorflow # noqa + + return True + except ImportError: + return False + + +def setup(): + slogging.setup("INFO", False) diff --git a/sourced/ml/core/tests/asdf/bow.asdf b/sourced/ml/core/tests/asdf/bow.asdf new file mode 100644 index 0000000..26b8ea0 Binary files /dev/null and b/sourced/ml/core/tests/asdf/bow.asdf differ diff --git a/sourced/ml/core/tests/asdf/coocc.asdf b/sourced/ml/core/tests/asdf/coocc.asdf new file mode 100644 index 0000000..9498b99 Binary files /dev/null and b/sourced/ml/core/tests/asdf/coocc.asdf differ diff --git a/sourced/ml/core/tests/asdf/coocc_df.asdf b/sourced/ml/core/tests/asdf/coocc_df.asdf new file mode 100644 index 0000000..b40f5d7 Binary files /dev/null and b/sourced/ml/core/tests/asdf/coocc_df.asdf differ diff --git a/sourced/ml/core/tests/asdf/docfreq_1000.asdf b/sourced/ml/core/tests/asdf/docfreq_1000.asdf new file mode 100644 index 0000000..2fa308d Binary files /dev/null and b/sourced/ml/core/tests/asdf/docfreq_1000.asdf differ diff --git a/sourced/ml/core/tests/asdf/id2vec_1000.asdf b/sourced/ml/core/tests/asdf/id2vec_1000.asdf new file mode 100644 index 0000000..d410d31 Binary files /dev/null and b/sourced/ml/core/tests/asdf/id2vec_1000.asdf differ diff --git a/sourced/ml/core/tests/asdf/quant.asdf b/sourced/ml/core/tests/asdf/quant.asdf new file mode 100644 index 0000000..793c4ba Binary files /dev/null and b/sourced/ml/core/tests/asdf/quant.asdf differ diff --git a/sourced/ml/core/tests/asdf/topics.asdf b/sourced/ml/core/tests/asdf/topics.asdf new file mode 100644 index 0000000..50b89dd Binary files /dev/null and b/sourced/ml/core/tests/asdf/topics.asdf differ diff --git a/sourced/ml/core/tests/asdf/uast.asdf b/sourced/ml/core/tests/asdf/uast.asdf new file mode 100644 index 0000000..8ca7458 Binary files /dev/null and b/sourced/ml/core/tests/asdf/uast.asdf differ diff --git a/sourced/ml/core/tests/asdf/voccoocc.asdf b/sourced/ml/core/tests/asdf/voccoocc.asdf new file mode 100644 index 0000000..835ab91 Binary files /dev/null and b/sourced/ml/core/tests/asdf/voccoocc.asdf differ diff --git a/sourced/ml/core/tests/identifiers.csv.tar.gz b/sourced/ml/core/tests/identifiers.csv.tar.gz new file mode 100644 index 0000000..4fac851 Binary files /dev/null and b/sourced/ml/core/tests/identifiers.csv.tar.gz differ diff --git a/sourced/ml/core/tests/models.py b/sourced/ml/core/tests/models.py new file mode 100644 index 0000000..6eeb379 --- /dev/null +++ b/sourced/ml/core/tests/models.py @@ -0,0 +1,25 @@ +from os.path import dirname, join + +_root = dirname(__file__) +_models_path = 
join(_root, "asdf") + +ID2VEC = join(_models_path, "id2vec_1000.asdf") +DOCFREQ = join(_models_path, "docfreq_1000.asdf") +QUANTLEVELS = join(_models_path, "quant.asdf") +BOW = join(_models_path, "bow.asdf") +COOCC = join(_models_path, "coocc.asdf") +COOCC_DF = join(_models_path, "coocc_df.asdf") +UAST = join(_models_path, "uast.asdf") +TOPICS = join(_models_path, "topics.asdf") + +DATA_DIR_SOURCE = join(_root, "source") +SOURCE_FILENAME = "example" +SOURCE = join(DATA_DIR_SOURCE, "%s.asdf" % SOURCE_FILENAME) +SOURCE_PY = join(DATA_DIR_SOURCE, "%s.py" % SOURCE_FILENAME) + +TOPICS_SRC = "topics_readable.txt" +PARQUET_DIR = join(_root, "parquet") +SIVA_DIR = join(_root, "siva") +IDENTIFIERS = join(_root, "identifiers.csv.tar.gz") + +MODER_FUNC = join(DATA_DIR_SOURCE, "example_functions.py") diff --git a/sourced/ml/core/tests/models_tensorflow/__init__.py b/sourced/ml/core/tests/models_tensorflow/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sourced/ml/core/tests/models_tensorflow/test_tensorflow.py b/sourced/ml/core/tests/models_tensorflow/test_tensorflow.py new file mode 100644 index 0000000..ff9f3a2 --- /dev/null +++ b/sourced/ml/core/tests/models_tensorflow/test_tensorflow.py @@ -0,0 +1,34 @@ +import io +import unittest + +from sourced.ml.core.models.tensorflow import TensorFlowModel +from sourced.ml.core.tests import has_tensorflow + + +class TensorFlowModelTests(unittest.TestCase): + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_serialize(self): + import tensorflow as tf + + a = tf.constant([[1, 0], [0, 1]]) + b = tf.constant([[0, 1], [1, 0]]) + c = tf.matmul(a, b) + gd = tf.get_default_graph().as_graph_def() + buffer = io.BytesIO() + TensorFlowModel().construct(graphdef=gd).save(buffer, series="tensorflow-model") + buffer.seek(0) + model = TensorFlowModel().load(buffer) + self.assertEqual(gd.node, model.graphdef.node) + + buffer = io.BytesIO() + with tf.Session() as session: + TensorFlowModel().construct(session=session, outputs=[c.name[:-2]]).save( + buffer, series="tensorflow-model" + ) + buffer.seek(0) + model = TensorFlowModel().load(buffer) + self.assertEqual(gd.node, model.graphdef.node) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/source/__init__.py b/sourced/ml/core/tests/source/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sourced/ml/core/tests/source/example.py b/sourced/ml/core/tests/source/example.py new file mode 100644 index 0000000..ca4ecbd --- /dev/null +++ b/sourced/ml/core/tests/source/example.py @@ -0,0 +1,17 @@ +import sys + +from modelforge.logs import setup_logging + + +utmain = sys.modules["__main__"] +if utmain.__package__ == "unittest" and utmain.__spec__ is None: + from collections import namedtuple + + ModuleSpec = namedtuple("ModuleSpec", ["name"]) + utmain.__spec__ = ModuleSpec("unittest.__main__") + del ModuleSpec +del utmain + + +def setup(): + setup_logging("INFO") diff --git a/sourced/ml/core/tests/source/example_functions.py b/sourced/ml/core/tests/source/example_functions.py new file mode 100644 index 0000000..fd7d05a --- /dev/null +++ b/sourced/ml/core/tests/source/example_functions.py @@ -0,0 +1,16 @@ +class Foo: + def func_a(self): + # should be counted + pass + + +def func_b(): + # should be counted + pass + + +def func_c(): + # should be counted + def func_d(): + # should not be counted + pass diff --git a/sourced/ml/core/tests/swivel/col_sums.txt b/sourced/ml/core/tests/swivel/col_sums.txt new file mode 100644 index 
0000000..44ea5c6 --- /dev/null +++ b/sourced/ml/core/tests/swivel/col_sums.txt @@ -0,0 +1,304 @@ +21 +58 +76 +59 +38 +92 +102 +102 +58 +26 +23 +44 +77 +68 +50 +49 +76 +25 +76 +49 +37 +44 +61 +75 +30 +90 +79 +102 +12 +3 +27 +74 +57 +62 +59 +58 +51 +100 +34 +75 +32 +113 +55 +19 +43 +65 +41 +49 +49 +39 +22 +22 +99 +37 +16 +29 +86 +49 +13 +14 +28 +44 +80 +50 +29 +20 +13 +154 +24 +76 +62 +70 +128 +27 +21 +39 +11 +45 +43 +124 +92 +80 +141 +88 +23 +24 +50 +52 +175 +43 +115 +12 +29 +16 +49 +48 +19 +95 +10 +63 +37 +102 +59 +21 +106 +76 +65 +50 +95 +59 +26 +29 +23 +21 +91 +78 +8 +78 +142 +50 +76 +112 +76 +61 +37 +133 +55 +93 +37 +19 +13 +16 +20 +32 +31 +31 +31 +64 +13 +58 +11 +21 +198 +77 +39 +50 +7 +59 +111 +12 +50 +102 +29 +141 +55 +58 +13 +39 +30 +22 +10 +27 +60 +40 +4 +58 +50 +76 +52 +74 +41 +59 +40 +74 +40 +40 +156 +73 +16 +32 +34 +31 +27 +93 +58 +31 +27 +19 +28 +64 +82 +44 +37 +37 +31 +62 +39 +95 +205 +25 +14 +18 +95 +26 +56 +10 +29 +59 +74 +24 +72 +19 +42 +18 +64 +33 +34 +54 +41 +51 +74 +41 +12 +9 +35 +25 +73 +39 +11 +76 +33 +9 +36 +52 +72 +27 +62 +45 +26 +149 +104 +64 +24 +64 +19 +26 +34 +21 +21 +22 +22 +97 +15 +61 +70 +27 +22 +85 +20 +107 +100 +104 +78 +17 +63 +40 +11 +141 +27 +30 +24 +78 +167 +32 +19 +89 +59 +46 +22 +6 +55 +50 +79 +5 +38 +12 +50 +97 +78 +29 +55 +72 +17 +95 +76 +59 +76 +61 +9 +45 +26 +32 +107 +19 \ No newline at end of file diff --git a/sourced/ml/core/tests/swivel/col_sums.txt.gz b/sourced/ml/core/tests/swivel/col_sums.txt.gz new file mode 100644 index 0000000..081051d Binary files /dev/null and b/sourced/ml/core/tests/swivel/col_sums.txt.gz differ diff --git a/sourced/ml/core/tests/swivel/col_vocab.txt b/sourced/ml/core/tests/swivel/col_vocab.txt new file mode 100644 index 0000000..78d2478 --- /dev/null +++ b/sourced/ml/core/tests/swivel/col_vocab.txt @@ -0,0 +1,304 @@ +i.access +i.action +i.activ +i.adapt +i.add +i.android +i.antonioleiva +i.app +i.append +i.appgroup +i.args +i.argument +i.argv +i.array +i.arrays +i.aslist +i.bar +i.build +i.bundle +i.button +i.call +i.captur +i.chdir +i.check +i.clear +i.click +i.close +i.com +i.command +i.communic +i.compil +i.confdir +i.conffil +i.conffilenam +i.config +i.conn +i.connect +i.content +i.copytre +i.count +i.counter +i.create +i.credenti +i.crypt +i.cursor +i.data +i.date +i.datetim +i.dbfile +i.delay +i.destdir +i.destfil +i.destroy +i.dev +i.develop +i.dict +i.dir +i.directori +i.dirnam +i.dirpath +i.dirs +i.docs +i.dropbox +i.edit +i.empty +i.endswith +i.env +i.error +i.euroclear +i.except +i.execut +i.exists +i.exit +i.expandus +i.ext +i.extens +i.factori +i.false +i.fetchon +i.file +i.filenam +i.files +i.find +i.finish +i.float +i.fname +i.for +i.format +i.get +i.getcwd +i.getenv +i.gethostbynam +i.getsiz +i.github +i.gone +i.handler +i.header +i.hide +i.hkey +i.home +i.host +i.impl +i.inflat +i.info +i.input +i.instanc +i.int +i.intent +i.interactor +i.invis +i.isdir +i.isempti +i.isfile +i.isoformat +i.item +i.items +i.iter +i.java +i.join +i.key +i.layout +i.len +i.length +i.line +i.linux +i.list +i.listdir +i.listen +i.listfil +i.lite +i.ljust +i.local +i.localtim +i.log +i.logdir +i.logfil +i.logfilenam +i.login +i.logsdir +i.long +i.lower +i.machin +i.main +i.make +i.makedir +i.master +i.math +i.menu +i.messag +i.min +i.move +i.mvpexampl +i.myping +i.name +i.navig +i.new +i.newenv +i.newfil +i.newscript +i.nmap +i.nmscan +i.node +i.none +i.number +i.object +i.old +i.onclick +i.oncreat +i.onfinish +i.onitem +i.onlogin +i.onopt +i.onpassword +i.onresum +i.onsuccess +i.onusernam +i.open +i.option +i.optpars 
+i.ospath +i.output +i.outputdir +i.outputfil +i.overrid +i.parent +i.parse +i.parser +i.pass +i.passwd +i.password +i.path +i.platform +i.popen +i.port +i.ports +i.posit +i.post +i.present +i.print +i.processor +i.profil +i.program +i.progress +i.putty +i.query +i.randint +i.random +i.range +i.raw +i.rdp +i.read +i.readfil +i.readlin +i.recycl +i.releas +i.remove +i.rename +i.replac +i.res +i.result +i.resume +i.ret +i.return +i.root +i.row +i.rows +i.run +i.runnabl +i.salt +i.saved +i.scan +i.scanner +i.screen +i.script +i.select +i.send +i.server +i.serverfil +i.session +i.set +i.show +i.shutil +i.sid +i.simple +i.sisdir +i.site +i.size +i.sleep +i.socket +i.sourcedir +i.sourcefil +i.split +i.splitext +i.sqlite +i.start +i.startswith +i.stat +i.state +i.stats +i.str +i.strftime +i.string +i.strip +i.subnet +i.subprocess +i.success +i.sum +i.sys +i.system +i.table +i.tablelist +i.test +i.text +i.tgt +i.thread +i.time +i.toast +i.today +i.todaystr +i.tofile +i.tohome +i.tostr +i.true +i.txt +i.type +i.usage +i.user +i.usernam +i.util +i.utils +i.valid +i.value +i.version +i.view +i.visibl +i.walk +i.widget +i.window +i.winreg +i.with +i.word +i.work +i.write +i.zip \ No newline at end of file diff --git a/sourced/ml/core/tests/swivel/col_vocab.txt.gz b/sourced/ml/core/tests/swivel/col_vocab.txt.gz new file mode 100644 index 0000000..73b3620 Binary files /dev/null and b/sourced/ml/core/tests/swivel/col_vocab.txt.gz differ diff --git a/sourced/ml/core/tests/swivel/row_sums.txt b/sourced/ml/core/tests/swivel/row_sums.txt new file mode 100644 index 0000000..44ea5c6 --- /dev/null +++ b/sourced/ml/core/tests/swivel/row_sums.txt @@ -0,0 +1,304 @@ +21 +58 +76 +59 +38 +92 +102 +102 +58 +26 +23 +44 +77 +68 +50 +49 +76 +25 +76 +49 +37 +44 +61 +75 +30 +90 +79 +102 +12 +3 +27 +74 +57 +62 +59 +58 +51 +100 +34 +75 +32 +113 +55 +19 +43 +65 +41 +49 +49 +39 +22 +22 +99 +37 +16 +29 +86 +49 +13 +14 +28 +44 +80 +50 +29 +20 +13 +154 +24 +76 +62 +70 +128 +27 +21 +39 +11 +45 +43 +124 +92 +80 +141 +88 +23 +24 +50 +52 +175 +43 +115 +12 +29 +16 +49 +48 +19 +95 +10 +63 +37 +102 +59 +21 +106 +76 +65 +50 +95 +59 +26 +29 +23 +21 +91 +78 +8 +78 +142 +50 +76 +112 +76 +61 +37 +133 +55 +93 +37 +19 +13 +16 +20 +32 +31 +31 +31 +64 +13 +58 +11 +21 +198 +77 +39 +50 +7 +59 +111 +12 +50 +102 +29 +141 +55 +58 +13 +39 +30 +22 +10 +27 +60 +40 +4 +58 +50 +76 +52 +74 +41 +59 +40 +74 +40 +40 +156 +73 +16 +32 +34 +31 +27 +93 +58 +31 +27 +19 +28 +64 +82 +44 +37 +37 +31 +62 +39 +95 +205 +25 +14 +18 +95 +26 +56 +10 +29 +59 +74 +24 +72 +19 +42 +18 +64 +33 +34 +54 +41 +51 +74 +41 +12 +9 +35 +25 +73 +39 +11 +76 +33 +9 +36 +52 +72 +27 +62 +45 +26 +149 +104 +64 +24 +64 +19 +26 +34 +21 +21 +22 +22 +97 +15 +61 +70 +27 +22 +85 +20 +107 +100 +104 +78 +17 +63 +40 +11 +141 +27 +30 +24 +78 +167 +32 +19 +89 +59 +46 +22 +6 +55 +50 +79 +5 +38 +12 +50 +97 +78 +29 +55 +72 +17 +95 +76 +59 +76 +61 +9 +45 +26 +32 +107 +19 \ No newline at end of file diff --git a/sourced/ml/core/tests/swivel/row_sums.txt.gz b/sourced/ml/core/tests/swivel/row_sums.txt.gz new file mode 100644 index 0000000..5680a5a Binary files /dev/null and b/sourced/ml/core/tests/swivel/row_sums.txt.gz differ diff --git a/sourced/ml/core/tests/swivel/row_vocab.txt b/sourced/ml/core/tests/swivel/row_vocab.txt new file mode 100644 index 0000000..78d2478 --- /dev/null +++ b/sourced/ml/core/tests/swivel/row_vocab.txt @@ -0,0 +1,304 @@ +i.access +i.action +i.activ +i.adapt +i.add +i.android +i.antonioleiva +i.app +i.append +i.appgroup +i.args +i.argument +i.argv +i.array +i.arrays +i.aslist +i.bar 
+i.build +i.bundle +i.button +i.call +i.captur +i.chdir +i.check +i.clear +i.click +i.close +i.com +i.command +i.communic +i.compil +i.confdir +i.conffil +i.conffilenam +i.config +i.conn +i.connect +i.content +i.copytre +i.count +i.counter +i.create +i.credenti +i.crypt +i.cursor +i.data +i.date +i.datetim +i.dbfile +i.delay +i.destdir +i.destfil +i.destroy +i.dev +i.develop +i.dict +i.dir +i.directori +i.dirnam +i.dirpath +i.dirs +i.docs +i.dropbox +i.edit +i.empty +i.endswith +i.env +i.error +i.euroclear +i.except +i.execut +i.exists +i.exit +i.expandus +i.ext +i.extens +i.factori +i.false +i.fetchon +i.file +i.filenam +i.files +i.find +i.finish +i.float +i.fname +i.for +i.format +i.get +i.getcwd +i.getenv +i.gethostbynam +i.getsiz +i.github +i.gone +i.handler +i.header +i.hide +i.hkey +i.home +i.host +i.impl +i.inflat +i.info +i.input +i.instanc +i.int +i.intent +i.interactor +i.invis +i.isdir +i.isempti +i.isfile +i.isoformat +i.item +i.items +i.iter +i.java +i.join +i.key +i.layout +i.len +i.length +i.line +i.linux +i.list +i.listdir +i.listen +i.listfil +i.lite +i.ljust +i.local +i.localtim +i.log +i.logdir +i.logfil +i.logfilenam +i.login +i.logsdir +i.long +i.lower +i.machin +i.main +i.make +i.makedir +i.master +i.math +i.menu +i.messag +i.min +i.move +i.mvpexampl +i.myping +i.name +i.navig +i.new +i.newenv +i.newfil +i.newscript +i.nmap +i.nmscan +i.node +i.none +i.number +i.object +i.old +i.onclick +i.oncreat +i.onfinish +i.onitem +i.onlogin +i.onopt +i.onpassword +i.onresum +i.onsuccess +i.onusernam +i.open +i.option +i.optpars +i.ospath +i.output +i.outputdir +i.outputfil +i.overrid +i.parent +i.parse +i.parser +i.pass +i.passwd +i.password +i.path +i.platform +i.popen +i.port +i.ports +i.posit +i.post +i.present +i.print +i.processor +i.profil +i.program +i.progress +i.putty +i.query +i.randint +i.random +i.range +i.raw +i.rdp +i.read +i.readfil +i.readlin +i.recycl +i.releas +i.remove +i.rename +i.replac +i.res +i.result +i.resume +i.ret +i.return +i.root +i.row +i.rows +i.run +i.runnabl +i.salt +i.saved +i.scan +i.scanner +i.screen +i.script +i.select +i.send +i.server +i.serverfil +i.session +i.set +i.show +i.shutil +i.sid +i.simple +i.sisdir +i.site +i.size +i.sleep +i.socket +i.sourcedir +i.sourcefil +i.split +i.splitext +i.sqlite +i.start +i.startswith +i.stat +i.state +i.stats +i.str +i.strftime +i.string +i.strip +i.subnet +i.subprocess +i.success +i.sum +i.sys +i.system +i.table +i.tablelist +i.test +i.text +i.tgt +i.thread +i.time +i.toast +i.today +i.todaystr +i.tofile +i.tohome +i.tostr +i.true +i.txt +i.type +i.usage +i.user +i.usernam +i.util +i.utils +i.valid +i.value +i.version +i.view +i.visibl +i.walk +i.widget +i.window +i.winreg +i.with +i.word +i.work +i.write +i.zip \ No newline at end of file diff --git a/sourced/ml/core/tests/swivel/row_vocab.txt.gz b/sourced/ml/core/tests/swivel/row_vocab.txt.gz new file mode 100644 index 0000000..5dd2fff Binary files /dev/null and b/sourced/ml/core/tests/swivel/row_vocab.txt.gz differ diff --git a/sourced/ml/core/tests/swivel/shard-000-000.pb b/sourced/ml/core/tests/swivel/shard-000-000.pb new file mode 100644 index 0000000..dec3c38 Binary files /dev/null and b/sourced/ml/core/tests/swivel/shard-000-000.pb differ diff --git a/sourced/ml/core/tests/swivel/shard-000-000.pb.gz b/sourced/ml/core/tests/swivel/shard-000-000.pb.gz new file mode 100644 index 0000000..7f2bf35 Binary files /dev/null and b/sourced/ml/core/tests/swivel/shard-000-000.pb.gz differ diff --git a/sourced/ml/core/tests/test_bblfsh_utils.py 
b/sourced/ml/core/tests/test_bblfsh_utils.py new file mode 100644 index 0000000..7bad09f --- /dev/null +++ b/sourced/ml/core/tests/test_bblfsh_utils.py @@ -0,0 +1,84 @@ +import errno +import os +import random +import socket +import time +import unittest + +import docker.client + +from sourced.ml.core.utils.bblfsh import BBLFSH_VERSION_HIGH, BBLFSH_VERSION_LOW, check_version + + +@unittest.skipIf(os.getenv("SKIP_BBLFSH_UTILS_TESTS", False), "Skip core.utils.bblfsh tests.") +class BblfshUtilsTests(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.docker_client = docker.from_env() + # ensure docker is running + try: + cls.docker_client.containers.list() + except Exception: + raise Exception("docker not running properly") + cls.er_msg = "supported bblfshd versions: " ">=%s,<%s" % ( + BBLFSH_VERSION_LOW, + BBLFSH_VERSION_HIGH, + ) + + def __check_bblfsh_version_support(self, version: str) -> bool: + """ + :param version: version of bblfshd to check + :return: True if version is supported, False otherwise + """ + with socket.socket() as s: + for _ in range(3): + try: + port = random.randint(10000, 50000) + s.connect(("localhost", port)) + except socket.error as e: + if e.errno == errno.ECONNREFUSED: + break + + container = self.docker_client.containers.run( + image="bblfsh/bblfshd:%s" % version, privileged=True, detach=True, ports={"9432": port} + ) + + assert container is not None, "failed to create bblfsh container" + + for _ in range(10): + try: + res = check_version(port=port) + break + except Exception: + time.sleep(0.1) + pass + + container.stop() + container.remove() + return res + + def test_v200(self): + self.assertFalse(self.__check_bblfsh_version_support("v2.0.0"), self.er_msg) + + def test_v210(self): + self.assertFalse(self.__check_bblfsh_version_support("v2.1.0"), self.er_msg) + + def test_v220(self): + self.assertTrue(self.__check_bblfsh_version_support("v2.2.0"), self.er_msg) + + def test_v230(self): + self.assertTrue(self.__check_bblfsh_version_support("v2.3.0"), self.er_msg) + + def test_v240(self): + self.assertTrue(self.__check_bblfsh_version_support("v2.4.0"), self.er_msg) + + def test_v250(self): + self.assertTrue(self.__check_bblfsh_version_support("v2.5.0"), self.er_msg) + + @classmethod + def tearDownClass(cls): + cls.docker_client.close() + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_bigartm.py b/sourced/ml/core/tests/test_bigartm.py new file mode 100644 index 0000000..1228c13 --- /dev/null +++ b/sourced/ml/core/tests/test_bigartm.py @@ -0,0 +1,29 @@ +import argparse +import os +import subprocess +import tempfile +import unittest + +from sourced.ml.core.utils import install_bigartm + + +class BigartmTests(unittest.TestCase): + gitdir = os.path.join(os.path.dirname(__file__), "..", "..") + + @unittest.skipUnless(os.getenv("FULL_TEST", False), "Need to define FULL_TEST env var.") + def test_install_bigartm(self): + with tempfile.TemporaryDirectory() as tmpdir: + args = argparse.Namespace(output=tmpdir, tmpdir=None) + self.assertIsNone(install_bigartm(args)) + self._valivate_bigartm(tmpdir) + + def _valivate_bigartm(self, tmpdir): + bigartm = os.path.join(tmpdir, "bigartm") + self.assertTrue(os.path.isfile(bigartm)) + self.assertEqual(os.stat(bigartm).st_mode & 0o777, 0o777) + output = subprocess.check_output([bigartm], stderr=subprocess.STDOUT) + self.assertIn("BigARTM v", output.decode()) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_bow.py 
b/sourced/ml/core/tests/test_bow.py new file mode 100644 index 0000000..4d054d0 --- /dev/null +++ b/sourced/ml/core/tests/test_bow.py @@ -0,0 +1,44 @@ +from io import BytesIO +import unittest + +import numpy + +from sourced.ml.core.models import BOW +import sourced.ml.core.tests.models as paths + + +class BOWTests(unittest.TestCase): + def setUp(self): + self.model = BOW().load(source=paths.BOW) + + def test_getitem(self): + repo_name, indices, weights = self.model[0] + self.assertEqual(repo_name, "repo1") + self.assertIsInstance(indices, numpy.ndarray) + self.assertIsInstance(weights, numpy.ndarray) + self.assertEqual(indices.shape, weights.shape) + self.assertEqual(indices.shape, (3,)) + + def test_iter(self): + pumped = list(self.model) + self.assertEqual(len(pumped), 5) + self.assertEqual(pumped, list(range(5))) + + def test_len(self): + self.assertEqual(len(self.model), 5) + + def test_tokens(self): + self.assertEqual(self.model.tokens[0], "i.") + + def test_write(self): + buffer = BytesIO() + self.model.save(output=buffer, series="bow-docfreq") + buffer.seek(0) + new_model = BOW().load(buffer) + self.assertEqual((self.model.matrix != new_model.matrix).nnz, 0) + self.assertEqual(self.model.documents, new_model.documents) + self.assertEqual(self.model.tokens, new_model.tokens) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_coocc.py b/sourced/ml/core/tests/test_coocc.py new file mode 100644 index 0000000..ee31d63 --- /dev/null +++ b/sourced/ml/core/tests/test_coocc.py @@ -0,0 +1,41 @@ +import unittest + +from sourced.ml.core.models.coocc import Cooccurrences +import sourced.ml.core.tests.models as paths + + +class CooccurrencesTests(unittest.TestCase): + def setUp(self): + self.model = Cooccurrences().load(source=paths.COOCC) + + def test_tokens(self): + tokens = self.model.tokens + self.assertIsInstance(tokens, list) + self.assertEqual( + tokens[:10], + [ + "i.set", + "i.iter", + "i.error", + "i.logsdir", + "i.read", + "i.captur", + "i.clear", + "i.android", + "i.tohome", + "i.ljust", + ], + ) + self.assertEqual(len(tokens), 304) + + def test_matrix(self): + matrix = self.model.matrix + self.assertEqual(matrix.shape, (304, 304)) + self.assertEqual(matrix.getnnz(), 16001) + + def test_len(self): + self.assertEqual(len(self.model), 304) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_df.py b/sourced/ml/core/tests/test_df.py new file mode 100644 index 0000000..b7b2872 --- /dev/null +++ b/sourced/ml/core/tests/test_df.py @@ -0,0 +1,101 @@ +from io import BytesIO +import unittest + +from sourced.ml.core.models import DocumentFrequencies +import sourced.ml.core.tests.models as paths + + +class DocumentFrequenciesTests(unittest.TestCase): + def setUp(self): + self.model = DocumentFrequencies().load(source=paths.DOCFREQ) + + def test_docs(self): + docs = self.model.docs + self.assertIsInstance(docs, int) + self.assertEqual(docs, 1000) + + def test_get(self): + self.assertEqual(self.model["aaaaaaa"], 341) + with self.assertRaises(KeyError): + print(self.model["xaaaaaa"]) + self.assertEqual(self.model.get("aaaaaaa", 0), 341) + self.assertEqual(self.model.get("xaaaaaa", 100500), 100500) + + def test_tokens(self): + self.assertEqual(list(self.model._df), self.model.tokens()) + + def test_len(self): + # the remaining 18 are not unique - the model was generated badly + self.assertEqual(len(self.model), 982) + + def test_iter(self): + aaa = False + for tok, freq in self.model: + if "aaaaaaa" in tok: + aaa = True + 
int(freq) + break + self.assertTrue(aaa) + + def test_prune(self): + pruned = self.model.prune(4) + for _, freq in pruned: + self.assertGreaterEqual(freq, 4) + self.assertEqual(len(pruned), 346) + + def test_prune_self(self): + pruned = self.model.prune(1) + self.assertIs(self.model, pruned) + + def test_greatest(self): + pruned = self.model.greatest(100) + freqs = [v for v in self.model._df.values()] + freqs.sort(reverse=True) + border = freqs[100] + for v in pruned._df.values(): + self.assertGreaterEqual(v, border) + df1 = pruned._df + df2 = self.model.greatest(100)._df + self.assertEqual(df1, df2) + + def test_greatest2(self): + df = DocumentFrequencies().construct(100, {str(x): x for x in range(1000)}) + df_greatest_true = {str(x): x for x in range(500, 1000)} + df_greatest = df.greatest(500) + self.assertEqual(df_greatest._df, df_greatest_true) + + df._df["500a"] = 500 + df._df["500b"] = 500 + df._df["500c"] = 500 + df._df["500d"] = 500 + df._df["500e"] = 500 + + df_greatest = df.greatest(500) + self.assertEqual(df_greatest._df, df_greatest_true) + + df_greatest_true["500a"] = 500 + df_greatest = df.greatest(501) + self.assertEqual(df_greatest._df, df_greatest_true) + + df_greatest_true["500b"] = 500 + df_greatest_true["500c"] = 500 + df_greatest_true["500d"] = 500 + df_greatest_true["500e"] = 500 + df_greatest = df.greatest(505) + self.assertEqual(df_greatest._df, df_greatest_true) + + df_greatest_true["499"] = 499 + df_greatest = df.greatest(506) + self.assertEqual(df_greatest._df, df_greatest_true) + + def test_write(self): + buffer = BytesIO() + self.model.save(buffer) + buffer.seek(0) + new_model = DocumentFrequencies().load(buffer) + self.assertEqual(self.model._df, new_model._df) + self.assertEqual(self.model.docs, new_model.docs) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_dump.py b/sourced/ml/core/tests/test_dump.py new file mode 100644 index 0000000..1370280 --- /dev/null +++ b/sourced/ml/core/tests/test_dump.py @@ -0,0 +1,144 @@ +import argparse +from contextlib import contextmanager +from io import StringIO +import logging +import os +import shutil +import sys +import tempfile +import unittest + +from modelforge.tools import dump_model + +import sourced.ml.core.tests.models as paths + + +cache_dir = os.path.join(tempfile.gettempdir(), "ml-test-dump") + + +@contextmanager +def captured_output(): + log = StringIO() + log_handler = logging.StreamHandler(log) + logging.getLogger().addHandler(log_handler) + new_out, new_err = StringIO(), StringIO() + old_out, old_err = sys.stdout, sys.stderr + try: + sys.stdout, sys.stderr = new_out, new_err + yield sys.stdout, sys.stderr, log + finally: + sys.stdout, sys.stderr = old_out, old_err + logging.getLogger().removeHandler(log_handler) + + +class DumpTests(unittest.TestCase): + ID2VEC_DUMP = """{'created_at': '2017-06-18 17:37:06', \ +'dependencies': [], \ +'license': 'ODbL-1.0', \ +'model': 'id2vec', \ +'series': 'id2vec-1000', \ +'size': '1.1 MB', \ +'uuid': '92609e70-f79c-46b5-8419-55726e873cfc', \ +'vendor': 'source{d}', \ +'version': [1, 0, 0]} +Shape: (1000, 300) +First 10 words: ['get', 'name', 'type', 'string', 'class', 'set', 'data', 'value', 'self', 'test'] +""" + + DOCFREQ_DUMP = ( + """{'created_at': '2017-08-09 16:49:12', \ +'dependencies': [], \ +'license': 'ODbL-1.0', \ +'model': 'docfreq', \ +'series': 'docfreq-1000', \ +'size': '6.4 kB', \ +'uuid': 'f64bacd4-67fb-4c64-8382-399a8e7db52a', \ +'vendor': 'source{d}', \ +'version': [0, 1, 0]} +Number of words: 982 +""" + + 
"Random 10 words: " + ) + + BOW_DUMP = """{'created_at': '2018-01-18 21:59:59', \ +'dependencies': [{'created_at': datetime.datetime(2018, 1, 18, 21, 59, 48, 828287), \ +'dependencies': [], \ +'model': 'docfreq', \ +'uuid': '2c4fcae7-93a6-496e-9e3a-d6e15d35b812', \ +'version': [1, 0, 0]}], \ +'license': 'ODbL-1.0', \ +'model': 'bow', \ +'parent': '51b4165d-b2c6-442a-93be-0eb35f4cc19a', \ +'series': 'bow-docfreq', \ +'size': '2.5 kB', \ +'uuid': '0d95f342-2c69-459f-9ee7-a1fc7da88d64', \ +'vendor': 'source{d}', \ +'version': [1, 0, 15]} +Shape: (5, 20) +First 10 documents: ['repo1', 'repo2', 'repo3', 'repo4', 'repo5'] +First 10 tokens: ['i.', 'i.*', 'i.Activity', 'i.AdapterView', 'i.ArrayAdapter', 'i.Arrays', 'i.Bundle', 'i.EditText', 'i.Exception', 'i.False']\n""" # noqa + + COOCC_DUMP = ( + """{'created_at': '2018-01-24 16:00:02', \ +'dependencies': [{'created_at': datetime.datetime(2018, 1, 24, 15, 59, 24, 129470), \ +'dependencies': [], \ +'model': 'docfreq', \ +'uuid': '0f94a6c6-7dc3-4b3c-b8d2-917164a50581', \ +'version': [1, 0, 0]}], \ +'license': 'ODbL-1.0', \ +'model': 'co-occurrences', \ +'series': 'coocc-docfreq', \ +'size': '79.9 kB', \ +'uuid': 'e75dcb2d-ec1d-476b-a04b-bc64c7779ae1', \ +'vendor': 'source{d}', \ +'version': [1, 0, 0]} +Number of words: 304 +First 10 words: ['i.set', 'i.iter', 'i.error', 'i.logsdir', 'i.read', 'i.captur', 'i.clear',""" + + """ 'i.android', 'i.tohome', 'i.ljust'] +Matrix: shape: (304, 304) non-zero: 16001 +""" + ) + + def tearDown(self): + if os.path.exists(cache_dir): + shutil.rmtree(cache_dir) + + def test_id2vec(self): + with captured_output() as (out, _, _): + dump_model(self._get_args(input=paths.ID2VEC)) + self.assertEqual(out.getvalue(), self.ID2VEC_DUMP) + + def test_docfreq(self): + with captured_output() as (out, _, _): + dump_model(self._get_args(input=paths.DOCFREQ)) + self.assertEqual(out.getvalue()[: len(self.DOCFREQ_DUMP)], self.DOCFREQ_DUMP) + ending = "\nNumber of documents: 1000\n" + self.assertEqual(out.getvalue()[-len(ending) :], ending) + + def test_bow(self): + with captured_output() as (out, _, _): + dump_model(self._get_args(input=paths.BOW)) + self.assertEqual(out.getvalue(), self.BOW_DUMP) + + def test_coocc(self): + with captured_output() as (out, _, _): + dump_model(self._get_args(input=paths.COOCC)) + self.assertEqual(out.getvalue(), self.COOCC_DUMP) + + @staticmethod + def _get_args(input): + return argparse.Namespace( + input=input, + backend=None, + args=None, + username="", + password="", + index_repo="https://github.com/src-d/models", + cache=cache_dir, + signoff=False, + log_level="WARNING", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_id_splitter_features.py b/sourced/ml/core/tests/test_id_splitter_features.py new file mode 100644 index 0000000..5d04aa0 --- /dev/null +++ b/sourced/ml/core/tests/test_id_splitter_features.py @@ -0,0 +1,191 @@ +import io +import tarfile +import tempfile +import unittest + +import numpy + +from sourced.ml.core.tests import has_tensorflow +from sourced.ml.core.tests.models import IDENTIFIERS + + +def write_fake_identifiers(tar_file, n_lines, char_sizes, n_cols, text="a"): + """ + Prepare file with fake identifiers. + :param tar_file: ready to write file. + :param n_lines: number of lines to generate. + :param char_sizes: sizes of identifiers. + :param n_cols: number of columns. + :param text: text that is used to fill identifiers. 
+ """ + # sanity check + if isinstance(char_sizes, int): + char_sizes = [char_sizes] * n_lines + assert len(char_sizes) == n_lines + + # generate file + res = [] + for sz in char_sizes: + line = ",".join([text * sz] * n_cols) + res.append(line) + content = "\n".join(res) + content = content.encode("utf-8") + + # add content to file + info = tarfile.TarInfo("identifiers.txt") + info.size = len(content) + tar_file.addfile(info, io.BytesIO(content)) + + +class IdSplitterTest(unittest.TestCase): + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_prepare_features(self): + from sourced.ml.core.algorithms.id_splitter.features import prepare_features + + # check feature extraction + text = "a a" + n_lines = 10 + max_identifier_len = 20 + with tempfile.NamedTemporaryFile() as tmp: + with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar: + write_fake_identifiers(tmp_tar, n_lines=n_lines, char_sizes=1, n_cols=2, text=text) + feat = prepare_features( + csv_path=tmp.name, + use_header=True, + identifier_col=0, + max_identifier_len=max_identifier_len, + split_identifier_col=1, + shuffle=True, + test_ratio=0.5, + padding="post", + ) + x_train, x_test, y_train, y_test = feat + # because of test_ratio=0.5 - shapes should be equal + self.assertEqual(x_test.shape, x_train.shape) + self.assertEqual(y_test.shape, y_train.shape) + # each line contains only one split -> so it should be only 5 nonzero for train/test + self.assertEqual(numpy.sum(y_test), 5) + self.assertEqual(numpy.sum(y_train), 5) + # each line contains only two chars -> so it should be only 10 nonzero for train/test + self.assertEqual(numpy.count_nonzero(x_test), 10) + self.assertEqual(numpy.count_nonzero(x_train), 10) + # y should be 3 dimensional matrix + self.assertEqual(y_test.ndim, 3) + self.assertEqual(y_train.ndim, 3) + # x should be 2 dimensional matrix + self.assertEqual(x_test.ndim, 2) + self.assertEqual(x_train.ndim, 2) + # check number of samples + self.assertEqual(x_test.shape[0] + x_train.shape[0], n_lines) + self.assertEqual(y_test.shape[0] + y_train.shape[0], n_lines) + # check max_identifier_len + self.assertEqual(x_test.shape[1], max_identifier_len) + self.assertEqual(x_train.shape[1], max_identifier_len) + self.assertEqual(y_test.shape[1], max_identifier_len) + self.assertEqual(y_train.shape[1], max_identifier_len) + + # normal file + try: + prepare_features( + csv_path=IDENTIFIERS, + use_header=True, + identifier_col=0, + max_identifier_len=max_identifier_len, + split_identifier_col=1, + shuffle=True, + test_ratio=0.5, + padding="post", + ) + except Exception as e: + self.fail("prepare_features raised %s with log %s" % (type(e), str(e))) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_read_identifiers(self): + from sourced.ml.core.algorithms.id_splitter.features import read_identifiers + + # read with header + with tempfile.NamedTemporaryFile() as tmp: + with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar: + write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=1, n_cols=5) + + res = read_identifiers( + csv_path=tmp.name, + use_header=True, + max_identifier_len=10, + identifier_col=3, + split_identifier_col=4, + ) + self.assertEqual(len(res), 10) + + # read without header + with tempfile.NamedTemporaryFile() as tmp: + with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar: + write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=1, n_cols=5) + + res = read_identifiers( + csv_path=tmp.name, + 
use_header=False, + max_identifier_len=10, + identifier_col=3, + split_identifier_col=4, + ) + self.assertEqual(len(res), 9) + + # read with max_identifier_len equal to 0 -> expect empty list + with tempfile.NamedTemporaryFile() as tmp: + with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar: + write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=1, n_cols=5) + + res = read_identifiers( + csv_path=tmp.name, + use_header=True, + max_identifier_len=0, + identifier_col=3, + split_identifier_col=4, + ) + self.assertEqual(len(res), 0) + + # generate temporary file with identifiers of specific lengths and filter by length + char_sizes = list(range(1, 11)) + + with tempfile.NamedTemporaryFile() as tmp: + with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar: + write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=char_sizes, n_cols=5) + + # check filtering + # read last two columns as identifiers + for i in range(11): + res = read_identifiers( + csv_path=tmp.name, + use_header=True, + max_identifier_len=i, + identifier_col=3, + split_identifier_col=4, + ) + self.assertEqual(len(res), i) + + # read wrong columns + with tempfile.NamedTemporaryFile() as tmp: + with tarfile.open(None, "w", fileobj=tmp, encoding="utf-8") as tmp_tar: + write_fake_identifiers(tmp_tar, n_lines=10, char_sizes=char_sizes, n_cols=2) + + with self.assertRaises(IndexError): + read_identifiers( + csv_path=tmp.name, + use_header=True, + max_identifier_len=10, + identifier_col=3, + split_identifier_col=4, + ) + + # normal file + try: + read_identifiers( + csv_path=IDENTIFIERS, + use_header=True, + max_identifier_len=10, + identifier_col=3, + split_identifier_col=4, + ) + except Exception as e: + self.fail("read_identifiers raised %s with log %s" % (type(e), str(e))) diff --git a/sourced/ml/core/tests/test_id_splitter_nn_model.py b/sourced/ml/core/tests/test_id_splitter_nn_model.py new file mode 100644 index 0000000..581e1fc --- /dev/null +++ b/sourced/ml/core/tests/test_id_splitter_nn_model.py @@ -0,0 +1,72 @@ +import string +import unittest + +import numpy + +from sourced.ml.core.tests import has_tensorflow + + +class MetricsTests(unittest.TestCase): + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_register_metric(self): + from sourced.ml.core.algorithms.id_splitter.nn_model import register_metric, METRICS + + fake_metric = "fake metric" + register_metric(fake_metric) + self.assertIn(fake_metric, METRICS) + METRICS.pop() + self.assertNotIn(fake_metric, METRICS) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_raise_register_metric(self): + from sourced.ml.core.algorithms.id_splitter.nn_model import register_metric, METRICS + + bad_metric = 1 + with self.assertRaises(AssertionError): + register_metric(bad_metric) + self.assertNotIn(bad_metric, METRICS) + + +class ModelsTests(unittest.TestCase): + def setUp(self): + from sourced.ml.core.algorithms.id_splitter.nn_model import build_rnn, build_cnn + + self.n_uniq = len(string.ascii_lowercase) + self.model_rnn = build_rnn( + maxlen=5, + units=24, + stack=2, + rnn_layer="LSTM", + optimizer="Adam", + dev0="/cpu:0", + dev1="/cpu:0", + ) + self.model_cnn = build_cnn( + maxlen=5, + filters=[64, 32, 16, 8], + output_n_filters=32, + stack=2, + kernel_sizes=[2, 4, 8, 16], + optimizer="Adam", + device="/cpu:0", + ) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_build_rnn(self): + self.assertTrue(self.model_rnn.built) + 
self.assertTrue(self.model_rnn.trainable) + self.assertIsInstance(self.model_rnn.get_weights()[0], numpy.ndarray) + self.assertEqual(self.model_rnn.get_weights()[0].shape, (self.n_uniq + 1, self.n_uniq + 1)) + self.assertTrue(self.model_rnn.uses_learning_phase) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_build_cnn(self): + self.assertTrue(self.model_cnn.built) + self.assertTrue(self.model_cnn.trainable) + self.assertIsInstance(self.model_cnn.get_weights()[0], numpy.ndarray) + self.assertEqual(self.model_cnn.get_weights()[0].shape, (self.n_uniq + 1, self.n_uniq + 1)) + self.assertTrue(self.model_cnn.uses_learning_phase) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_id_splitter_pipeline.py b/sourced/ml/core/tests/test_id_splitter_pipeline.py new file mode 100644 index 0000000..9e712c7 --- /dev/null +++ b/sourced/ml/core/tests/test_id_splitter_pipeline.py @@ -0,0 +1,134 @@ +import tempfile +import unittest + +import numpy + +from sourced.ml.core.tests import has_tensorflow + + +class IdSplitterPipelineTest(unittest.TestCase): + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_binarize(self): + from sourced.ml.core.algorithms.id_splitter.pipeline import binarize + + thresholds = [0, 0.09, 0.19, 0.29, 0.39, 0.49, 0.59, 0.69, 0.79, 0.89, 0.99] + n_pos = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + + for th, n_p in zip(thresholds, n_pos): + vals = numpy.arange(10) / 10 + res = binarize(vals, th) + self.assertEqual(sum(binarize(vals, th)), n_p) + if th in (0, 0.99): + self.assertEqual(numpy.unique(res).shape[0], 1) + else: + self.assertEqual(numpy.unique(res).shape[0], 2) + + vals = numpy.arange(10) / 10 + old_vals = vals.copy() + for th, n_p in zip(thresholds, n_pos): + res = binarize(vals, th, inplace=False) + self.assertEqual(sum(res), n_p) + self.assertTrue(numpy.array_equal(old_vals, vals)) + if th in (0, 0.99): + self.assertEqual(numpy.unique(res).shape[0], 1) + else: + self.assertEqual(numpy.unique(res).shape[0], 2) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_prepare_devices(self): + from sourced.ml.core.algorithms.id_splitter.nn_model import prepare_devices + + correct_args = ["1", "0,1", "-1"] + resulted_dev = [("/gpu:1", "/gpu:1"), ("/gpu:0", "/gpu:1"), ("/cpu:0", "/cpu:0")] + for res, arg in zip(resulted_dev, correct_args): + self.assertEqual(res, prepare_devices(arg)) + + bad_args = ["", "1,2,3"] + for arg in bad_args: + with self.assertRaises(ValueError): + prepare_devices(arg) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_build_schedule(self): + from sourced.ml.core.algorithms.id_splitter.pipeline import build_schedule + + start_lr = 10 + end_lr = 1 + n_epochs = 9 + + lr_schedule = build_schedule(lr=start_lr, final_lr=end_lr, n_epochs=n_epochs) + + for i in range(n_epochs): + self.assertEqual(start_lr - i, lr_schedule(epoch=i)) + + with self.assertRaises(AssertionError): + lr_schedule(-1) + with self.assertRaises(AssertionError): + lr_schedule(n_epochs + 1) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_build_train_generator(self): + from sourced.ml.core.algorithms.id_splitter.pipeline import build_train_generator + + batch_size = 3 + # mismatch number of samples + bad_x = numpy.zeros(3) + bad_y = numpy.zeros(4) + with self.assertRaises(AssertionError): + build_train_generator(bad_x, bad_y, batch_size=batch_size) + + # check generator with correct inputs + 
x = numpy.zeros(5) + gen = build_train_generator(x, x, batch_size=batch_size) + expected_n_samples = [3, 2] + for n_samples in expected_n_samples: + x_gen, y_gen = next(gen) + self.assertEqual(x_gen.shape, y_gen.shape) + self.assertEqual(n_samples, x_gen.shape[0]) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_train_parameters(self): + from sourced.ml.core.algorithms.id_splitter.pipeline import create_generator_params + + batch_size = 500 + samples_per_epoch = 10 ** 6 + n_samples = 40 * 10 ** 6 + epochs = 10 + + steps_per_epoch_ = samples_per_epoch // batch_size + n_epochs_ = numpy.ceil(epochs * n_samples / samples_per_epoch) + + steps_per_epoch, n_epochs = create_generator_params( + batch_size, samples_per_epoch, n_samples, epochs + ) + self.assertEqual(steps_per_epoch, steps_per_epoch_) + self.assertEqual(n_epochs, n_epochs_) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_config_keras(self): + from keras.backend.tensorflow_backend import get_session + from sourced.ml.core.algorithms.id_splitter.pipeline import config_keras + + config_keras() + sess = get_session() + self.assertTrue(sess._config.gpu_options.allow_growth) + + @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.") + def test_prepare_callbacks(self): + from keras.callbacks import TensorBoard, CSVLogger, ModelCheckpoint + from sourced.ml.core.algorithms.id_splitter.pipeline import prepare_callbacks + + with tempfile.TemporaryDirectory() as tmpdir: + callbacks = prepare_callbacks(tmpdir) + + # TensorBoard + self.assertIsInstance(callbacks[0], TensorBoard) + self.assertTrue(callbacks[0].log_dir.startswith(tmpdir)) + + # CSVLogger + self.assertIsInstance(callbacks[1], CSVLogger) + self.assertTrue(callbacks[1].filename.startswith(tmpdir)) + + # ModelCheckpoint + self.assertIsInstance(callbacks[2], ModelCheckpoint) + self.assertTrue(callbacks[2].filepath.startswith(tmpdir)) diff --git a/sourced/ml/core/tests/test_inttypes_to_nodes.py b/sourced/ml/core/tests/test_inttypes_to_nodes.py new file mode 100644 index 0000000..998a7fd --- /dev/null +++ b/sourced/ml/core/tests/test_inttypes_to_nodes.py @@ -0,0 +1,40 @@ +import unittest + +from bblfsh import BblfshClient + +from sourced.ml.core.algorithms import Uast2QuantizedChildren +from sourced.ml.core.tests.models import SOURCE_PY + + +class Uast2NodesBagTest(unittest.TestCase): + def setUp(self): + self.nodes_bag_extractor = Uast2QuantizedChildren(npartitions=3) + self.uast = BblfshClient("0.0.0.0:9432").parse(SOURCE_PY).uast + + def test_uast_to_bag(self): + bag = self.nodes_bag_extractor(self.uast) + self.assertGreater(len(bag), 0, "Expected size of bag should be > 0") + + def test_quantize_1(self): + freqs = {1: 100, 2: 90, 3: 10, 5: 10, 6: 5, 7: 5} + levels = self.nodes_bag_extractor.quantize_unwrapped(freqs.items()) + self.assertEqual(list(levels), [1, 2, 3, 7]) + + def test_quantize_2(self): + freqs = {1: 10, 2: 10, 3: 10, 5: 10, 6: 10, 7: 10} + levels = self.nodes_bag_extractor.quantize_unwrapped(freqs.items()) + self.assertEqual(list(levels), [1, 3, 6, 7]) + + def test_quantize_3(self): + freqs = {1: 100, 2: 1, 3: 1, 5: 1, 6: 1, 7: 1} + levels = self.nodes_bag_extractor.quantize_unwrapped(freqs.items()) + self.assertEqual(list(levels), [1, 2, 7, 7]) + + def test_quantize_4(self): + freqs = {1: 10, 2: 15, 3: 5, 5: 15, 6: 10, 7: 10} + levels = self.nodes_bag_extractor.quantize_unwrapped(freqs.items()) + self.assertEqual(list(levels), [1, 2, 5, 7]) + + +if __name__ == "__main__": + 
unittest.main() diff --git a/sourced/ml/core/tests/test_merge_bow.py b/sourced/ml/core/tests/test_merge_bow.py new file mode 100644 index 0000000..80d6b3c --- /dev/null +++ b/sourced/ml/core/tests/test_merge_bow.py @@ -0,0 +1,93 @@ +import os +import tempfile +import unittest + +import numpy +from scipy.sparse import csc_matrix + +from sourced.ml.core.models import BOW +from sourced.ml.core.models.model_converters.merge_bow import MergeBOW + + +class MergeBOWTests(unittest.TestCase): + def setUp(self): + self.model1 = BOW().construct( + ["doc_1", "doc_2", "doc_3"], + ["f.tok_1", "k.tok_2", "f.tok_3"], + csc_matrix( + (numpy.array([1, 2]), (numpy.array([0, 1]), numpy.array([1, 0]))), shape=(3, 3) + ), + ) + self.model1._meta = {"dependencies": [{"model": "docfreq", "uuid": "uuid"}]} + self.model2 = BOW().construct( + ["doc_4", "doc_5", "doc_6"], + ["f.tok_1", "k.tok_2", "f.tok_3"], + csc_matrix( + (numpy.array([3, 4]), (numpy.array([0, 1]), numpy.array([1, 0]))), shape=(3, 3) + ), + ) + self.model2._meta = {"dependencies": [{"model": "docfreq", "uuid": "uuid"}]} + self.merge_results = [[0, 1, 0], [2, 0, 0], [0, 0, 0], [0, 3, 0], [4, 0, 0], [0, 0, 0]] + self.merge_bow = MergeBOW() + + def test_convert_model_base(self): + self.merge_bow.convert_model(self.model1) + self.assertListEqual(self.merge_bow.documents, ["doc_1", "doc_2", "doc_3"]) + self.assertListEqual(self.merge_bow.tokens, ["f.tok_1", "k.tok_2", "f.tok_3"]) + for i, row in enumerate(self.merge_bow.matrix[0].toarray()): + self.assertListEqual(list(row), self.merge_results[i]) + self.assertEqual(self.merge_bow.deps, [{"uuid": "uuid", "model": "docfreq"}]) + self.merge_bow.convert_model(self.model2) + self.assertListEqual( + self.merge_bow.documents, ["doc_1", "doc_2", "doc_3", "doc_4", "doc_5", "doc_6"] + ) + self.assertListEqual(self.merge_bow.tokens, ["f.tok_1", "k.tok_2", "f.tok_3"]) + for i, arr in enumerate(self.merge_bow.matrix): + for j, row in enumerate(arr.toarray()): + self.assertListEqual(list(row), self.merge_results[i * 3 + j]) + self.assertEqual(self.merge_bow.deps, [{"model": "docfreq", "uuid": "uuid"}]) + + def test_convert_model_error(self): + self.merge_bow.convert_model(self.model1) + self.model2._tokens = ["f.tok_1", "k.tok_2"] + with self.assertRaises(ValueError): + self.merge_bow.convert_model(self.model2) + self.model2._tokens = ["f.tok_1", "k.tok_2", "f.tok_3", "f.tok_4"] + with self.assertRaises(ValueError): + self.merge_bow.convert_model(self.model2) + + def test_finalize_base(self): + self.merge_bow.convert_model(self.model1) + self.merge_bow.convert_model(self.model2) + with tempfile.TemporaryDirectory(prefix="merge-bow-") as tmpdir: + dest = os.path.join(tmpdir, "bow.asdf") + self.merge_bow.finalize(0, dest) + bow = BOW().load(dest) + self.assertListEqual( + bow.documents, ["doc_1", "doc_2", "doc_3", "doc_4", "doc_5", "doc_6"] + ) + self.assertListEqual(bow.tokens, ["f.tok_1", "k.tok_2", "f.tok_3"]) + for i, row in enumerate(bow.matrix.toarray()): + self.assertListEqual(list(row), self.merge_results[i]) + self.assertEqual(bow.meta["dependencies"], [{"uuid": "uuid", "model": "docfreq"}]) + + def test_finalize_reduce(self): + self.merge_bow.convert_model(self.model1) + self.merge_bow.features_namespaces = "f." 
+ with tempfile.TemporaryDirectory(prefix="merge-bow-") as tmpdir: + dest = os.path.join(tmpdir, "bow.asdf") + self.merge_bow.finalize(0, dest) + bow = BOW().load(dest) + self.assertListEqual(bow.documents, ["doc_1", "doc_2", "doc_3"]) + self.assertListEqual(bow.tokens, ["f.tok_1", "f.tok_3"]) + for i, row in enumerate(bow.matrix.toarray()): + self.assertListEqual(list(row), self.merge_results[i][::2]) + self.assertEqual(bow.meta["dependencies"], [{"uuid": "uuid", "model": "docfreq"}]) + + def test_save_path(self): + self.assertEqual(self.merge_bow._save_path(0, "bow.asdf"), "bow.asdf") + self.assertEqual(self.merge_bow._save_path(0, "bow"), os.path.join("bow", "bow_0.asdf")) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_merge_df.py b/sourced/ml/core/tests/test_merge_df.py new file mode 100644 index 0000000..8329d22 --- /dev/null +++ b/sourced/ml/core/tests/test_merge_df.py @@ -0,0 +1,40 @@ +import os +import tempfile +import unittest + +from sourced.ml.core.models import DocumentFrequencies +from sourced.ml.core.models.model_converters.merge_df import MergeDocFreq + + +class Model2BaseTests(unittest.TestCase): + def setUp(self): + self.model1 = DocumentFrequencies().construct(3, {"one": 1, "two": 2, "three": 3}) + self.model2 = DocumentFrequencies().construct(3, {"four": 4, "three": 3, "five": 5}) + self.merge_df = MergeDocFreq(min_docfreq=1, vocabulary_size=100) + self.merge_result = {"one": 1, "two": 2, "three": 6, "four": 4, "five": 5} + + def test_convert_model(self): + self.merge_df.convert_model(self.model1) + self.assertEqual(self.merge_df._docs, 3) + self.assertEqual(self.merge_df._df, self.model1._df) + self.merge_df.convert_model(self.model2) + self.assertEqual(self.merge_df._docs, 6) + self.assertEqual(self.merge_df._df, self.merge_result) + + def test_finalize(self): + self.merge_df.convert_model(self.model1) + self.merge_df.convert_model(self.model2) + with tempfile.TemporaryDirectory(prefix="merge-df-") as tmpdir: + dest = os.path.join(tmpdir, "df.asdf") + self.merge_df.finalize(0, dest) + df = DocumentFrequencies().load(dest) + self.assertEqual(df.docs, 6) + self.assertEqual(df._df, self.merge_result) + + def test_save_path(self): + self.assertEqual(self.merge_df._save_path(0, "df.asdf"), "df.asdf") + self.assertEqual(self.merge_df._save_path(0, "df"), os.path.join("df", "docfreq_0.asdf")) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_model2base.py b/sourced/ml/core/tests/test_model2base.py new file mode 100644 index 0000000..18a09c9 --- /dev/null +++ b/sourced/ml/core/tests/test_model2base.py @@ -0,0 +1,98 @@ +import os +import tempfile +import unittest + +from sourced.ml.core.models.model_converters.base import Model2Base + + +class FromModel: + NAME = "from" + meta = {"dependencies": ()} + + def __init__(self, **kwargs): + pass + + def load(self, source): + pass + + +class ToModel: + NAME = "to" + output = None + meta = {"dependencies": ()} + + def __init__(self, **kwargs): + pass + + def save(self, output, deps=None): + ToModel.output = output + + +class Model2Test(Model2Base): + MODEL_FROM_CLASS = FromModel + MODEL_TO_CLASS = ToModel + finalized = False + + def convert_model(self, model): + return ToModel() + + +class MockingModel2Test(Model2Base): + MODEL_FROM_CLASS = FromModel + MODEL_TO_CLASS = ToModel + finalized = False + + def convert_model(self, model): + return ToModel() + + def finalize(self, index: int, destdir: str): + self.finalized = True + + +class 
RaisingModel2Test(Model2Base): + MODEL_FROM_CLASS = FromModel + MODEL_TO_CLASS = ToModel + + def convert_model(self, model): + raise ValueError("happens") + + +class FakeQueue: + def __init__(self, contents: list): + self.contents = contents + + def get(self): + return self.contents.pop() + + def put(self, item): + self.contents.append(item) + + +class Model2BaseTests(unittest.TestCase): + def test_convert(self): + converter = Model2Test(num_processes=2) + with tempfile.TemporaryDirectory() as tmpdir: + status = converter.convert(os.listdir(os.path.dirname(__file__)), tmpdir) + self.assertGreater(status, 20) + + def test_process_entry(self): + converter = MockingModel2Test(num_processes=2) + queue_in = FakeQueue([None, "srcdir/job"]) + queue_out = FakeQueue([]) + with tempfile.TemporaryDirectory(prefix="sourced-ml-") as tmpdir: + converter._process_entry(0, os.path.join(tmpdir, "destdir"), queue_in, queue_out) + self.assertTrue(os.path.exists(os.path.join(tmpdir, "destdir"))) + self.assertEqual(ToModel.output, os.path.join(tmpdir, "destdir", "job")) + self.assertTrue(converter.finalized) + self.assertEqual(queue_out.contents, [("srcdir/job", True)]) + + def test_process_entry_exception(self): + converter = RaisingModel2Test(num_processes=2) + queue_in = FakeQueue([None, "srcdir/job"]) + queue_out = FakeQueue([]) + converter._process_entry(0, "destdir", queue_in, queue_out) + self.assertEqual(queue_out.contents, [("srcdir/job", False)]) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_pickleable_logger.py b/sourced/ml/core/tests/test_pickleable_logger.py new file mode 100644 index 0000000..c572ada --- /dev/null +++ b/sourced/ml/core/tests/test_pickleable_logger.py @@ -0,0 +1,22 @@ +import logging +import pickle +import unittest + +from sourced.ml.core.utils import PickleableLogger + + +class TestLogger(PickleableLogger): + def _get_log_name(self): + return "test" + + +class PickleableLoggerTests(unittest.TestCase): + def test_pickle(self): + logger = TestLogger(log_level=logging.ERROR) + logger = pickle._loads(pickle._dumps(logger)) + self.assertIsInstance(logger._log, logging.Logger) + self.assertEqual(logger._log.level, logging.ERROR) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_projector.py b/sourced/ml/core/tests/test_projector.py new file mode 100644 index 0000000..77bbe98 --- /dev/null +++ b/sourced/ml/core/tests/test_projector.py @@ -0,0 +1,138 @@ +import json +import os +import shutil +import socket +import tempfile +import time +import unittest + +from modelforge import slogging +import requests + +from sourced.ml.core.tests.test_dump import captured_output +from sourced.ml.core.utils.projector import CORSWebServer, present_embeddings, wait, web_server + + +class ProjectorTests(unittest.TestCase): + MAX_ATTEMPTS = 40 + + @classmethod + def setUpClass(cls): + slogging.setup("DEBUG", False) + + def setUp(self): + self.pwd = os.getcwd() + + def tearDown(self): + os.chdir(self.pwd) + + def wait_for_web_server(self): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + result = -1 + attempts = 0 + while result != 0 and attempts < self.MAX_ATTEMPTS: + time.sleep(0.05) + attempts += 1 + result = sock.connect_ex(("0.0.0.0", 8000)) + return attempts, result + + def test_web_server(self): + with tempfile.TemporaryDirectory(prefix="ml-core-test-") as tmpdir: + os.chdir(tmpdir) + testfile = "test.txt" + with open(testfile, "w") as fout: + fout.write("The Zen of Python, by Tim Peters") + server 
= CORSWebServer() + server.start() + + try: + attempts, result = self.wait_for_web_server() + self.assertTrue(attempts < self.MAX_ATTEMPTS or result == 0) + self.assertEqual( + requests.get("http://0.0.0.0:8000/test.txt").text, + "The Zen of Python, by Tim Peters", + ) + finally: + server.stop() + + def test_wait(self): + web_server.start() + try: + attempts, result = self.wait_for_web_server() + self.assertTrue(attempts < self.MAX_ATTEMPTS or result == 0) + self.assertTrue(web_server.running) + except: # noqa + web_server.stop() + raise + os.environ["PROJECTOR_SERVER_TIME"] = "0" + wait() + self.assertFalse(web_server.running) + web_server.start() + try: + attempts, result = self.wait_for_web_server() + self.assertTrue(attempts < self.MAX_ATTEMPTS or result == 0) + self.assertTrue(web_server.running) + finally: + web_server.stop() + + def test_present_embeddings(self): + with tempfile.TemporaryDirectory(prefix="ml-core-test-") as tmpdir: + tmpdir = os.path.join(tmpdir, "1", "2") + present_embeddings( + tmpdir, + False, + ["one", "two"], + [(str(i), "x") for i in range(5)], + [(i, i) for i in range(5)], + ) + with open(os.path.join(tmpdir, "id2vec.json")) as fin: + json.load(fin) + with open(os.path.join(tmpdir, "id2vec_meta.tsv")) as fin: + self.assertEqual(fin.read(), "one\ttwo\n0\tx\n1\tx\n2\tx\n3\tx\n4\tx\n") + with open(os.path.join(tmpdir, "id2vec_data.tsv")) as fin: + self.assertEqual(fin.read(), "0\t0\n1\t1\n2\t2\n3\t3\n4\t4\n") + + def test_present_embeddings_run_server(self): + def sweded_which(prog): + return None + + which = shutil.which + shutil.which = sweded_which + browser = os.getenv("BROWSER", "") + os.environ["BROWSER"] = "" + + try: + with tempfile.TemporaryDirectory(prefix="ml-core-test-") as tmpdir: + with captured_output() as (stdout, _, _): + present_embeddings( + tmpdir, + True, + ["one"], + [str(i) for i in range(5)], + [(i, i) for i in range(5)], + ) + with open(os.path.join(tmpdir, "id2vec.json")) as fin: + json.load(fin) + with open(os.path.join(tmpdir, "id2vec_meta.tsv")) as fin: + self.assertEqual(fin.read(), "0\n1\n2\n3\n4\n") + with open(os.path.join(tmpdir, "id2vec_data.tsv")) as fin: + self.assertEqual(fin.read(), "0\t0\n1\t1\n2\t2\n3\t3\n4\t4\n") + self.assertIn( + "\thttp://projector.tensorflow.org/?config=http://0.0.0.0:8000/id2vec.json\n", + stdout.getvalue(), + ) + finally: + shutil.which = which + os.environ["BROWSER"] = browser + web_server.stop() + + def test_stop(self): + web_server.stop() # dummy test to avoid partially covered line in CI + self.assertFalse(web_server.running) + web_server.start() + web_server.stop() + self.assertFalse(web_server.running) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_quant.py b/sourced/ml/core/tests/test_quant.py new file mode 100644 index 0000000..b9b32a3 --- /dev/null +++ b/sourced/ml/core/tests/test_quant.py @@ -0,0 +1,57 @@ +from io import BytesIO +import unittest + +import numpy + +from sourced.ml.core.models import QuantizationLevels +import sourced.ml.core.tests.models as paths + + +class QuantizationLevelsTests(unittest.TestCase): + def setUp(self): + self.model = QuantizationLevels().load(source=paths.QUANTLEVELS) + + def test_levels(self): + levels = self.model.levels + self.assertIsInstance(levels, dict) + self.assertEqual(len(levels), 1) + self.assertIsInstance(levels["children"], dict) + self.assertEqual(len(levels["children"]), 259) + + def test_len(self): + self.assertEqual(len(self.model), 1) + + def test_write(self): + levels = { + "xxx": { + "a": 
numpy.array([1, 2, 3]), + "b": numpy.array([4, 5, 6]), + "c": numpy.array([7, 8, 9]), + }, + "yyy": { + "q": numpy.array([3, 2, 1]), + "w": numpy.array([6, 5, 4]), + "e": numpy.array([9, 8, 7]), + }, + } + buffer = BytesIO() + QuantizationLevels().construct(levels).save(output=buffer, series="quant") + buffer.seek(0) + model = QuantizationLevels().load(buffer) + levels = model.levels + self.assertEqual(len(levels), 2) + self.assertEqual(len(levels["xxx"]), 3) + self.assertEqual(len(levels["yyy"]), 3) + self.assertTrue((levels["xxx"]["a"] == numpy.array([1, 2, 3])).all()) + self.assertTrue((levels["xxx"]["b"] == numpy.array([4, 5, 6])).all()) + self.assertTrue((levels["xxx"]["c"] == numpy.array([7, 8, 9])).all()) + self.assertTrue((levels["yyy"]["q"] == numpy.array([3, 2, 1])).all()) + self.assertTrue((levels["yyy"]["w"] == numpy.array([6, 5, 4])).all()) + self.assertTrue((levels["yyy"]["e"] == numpy.array([9, 8, 7])).all()) + + def test_dump(self): + self.assertEqual(self.model.dump(), "Schemes: [('children', '259@10')]") + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_random_walk.py b/sourced/ml/core/tests/test_random_walk.py new file mode 100644 index 0000000..88005af --- /dev/null +++ b/sourced/ml/core/tests/test_random_walk.py @@ -0,0 +1,32 @@ +import unittest + +import bblfsh + +from sourced.ml.core.algorithms.uast.ids_to_bag import FakeVocabulary +from sourced.ml.core.algorithms.uast.struct_to_bag import Uast2RandomWalks +from sourced.ml.core.tests import models + + +class RandomWalkTests(unittest.TestCase): + def setUp(self): + self.bblfsh = bblfsh.BblfshClient("localhost:9432") + self.uast = self.bblfsh.parse(models.SOURCE_PY).uast + self.uast2walk = Uast2RandomWalks( + p_explore_neighborhood=0.5, + q_leave_neighborhood=0.5, + n_walks=5, + n_steps=19, + node2index=FakeVocabulary(), + seed=42, + ) + + def test_rw(self): + for walk in self.uast2walk(self.uast): + for i in range(len(walk) - 1): + self.assertNotEqual( + walk[i], walk[i + 1], "Two neighbours nodes should not be the same" + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_token_parser.py b/sourced/ml/core/tests/test_token_parser.py new file mode 100644 index 0000000..e34634f --- /dev/null +++ b/sourced/ml/core/tests/test_token_parser.py @@ -0,0 +1,178 @@ +import pickle +import unittest + +from sourced.ml.core.algorithms.token_parser import NoopTokenParser, TokenParser + + +class TokenParserTests(unittest.TestCase): + def setUp(self): + self.tp = TokenParser(stem_threshold=4, max_token_length=20) + self.tp._single_shot = False + + def test_process_token(self): + self.tp.max_token_length = 100 + + tokens = [ + ("UpperCamelCase", ["upper", "camel", "case"]), + ("camelCase", ["camel", "case"]), + ("FRAPScase", ["frap", "case"]), + ("SQLThing", ["sqlt", "hing"]), + ("_Astra", ["astra"]), + ("CAPS_CONST", ["caps", "const"]), + ("_something_SILLY_", ["someth", "silli"]), + ("blink182", ["blink"]), + ("FooBar100500Bingo", ["foo", "bar", "bingo"]), + ("Man45var", ["man", "var"]), + ("method_name", ["method", "name"]), + ("Method_Name", ["method", "name"]), + ("101dalms", ["dalm"]), + ("101_dalms", ["dalm"]), + ("101_DalmsBug", ["dalm", "bug"]), + ("101_Dalms45Bug7", ["dalm", "bug"]), + ("wdSize", ["wd", "size", "wdsize"]), + ("Glint", ["glint"]), + ("foo_BAR", ["foo", "bar"]), + ( + "sourced.ml.algorithms.uast_ids_to_bag", + [ + "sourc", + "sourcedml", + "algorithm", + "mlalgorithm", + "uast", + "ids", + "idsto", + "bag", + "tobag", + ], + ), + 
("WORSTnameYOUcanIMAGINE", ["worst", "name", "you", "can", "imagin"]), + # Another bad example. Parser failed to parse it correctly + ("SmallIdsToFoOo", ["small", "ids", "idsto", "fo", "oo"]), + ("SmallIdFooo", ["small", "smallid", "fooo", "idfooo"]), + ( + "ONE_M0re_.__badId.example", + ["one", "onem", "re", "bad", "rebad", "badid", "exampl", "idexampl"], + ), + ("never_use_Such__varsableNames", ["never", "use", "such", "varsabl", "name"]), + ("a.b.c.d", ["a", "b", "c", "d"]), + ("A.b.Cd.E", ["a", "b", "cd", "e"]), + ("looong_sh_loooong_sh", ["looong", "looongsh", "loooong", "shloooong", "loooongsh"]), + ("sh_sh_sh_sh", ["sh", "sh", "sh", "sh"]), + ("loooong_loooong_loooong", ["loooong", "loooong", "loooong"]), + ] + + for token, correct in tokens: + res = list(self.tp.process_token(token)) + self.assertEqual(res, correct) + + def test_process_token_single_shot(self): + self.tp.max_token_length = 100 + self.tp._single_shot = True + self.tp.min_split_length = 1 + tokens = [ + ("UpperCamelCase", ["upper", "camel", "case"]), + ("camelCase", ["camel", "case"]), + ("FRAPScase", ["frap", "case"]), + ("SQLThing", ["sqlt", "hing"]), + ("_Astra", ["astra"]), + ("CAPS_CONST", ["caps", "const"]), + ("_something_SILLY_", ["someth", "silli"]), + ("blink182", ["blink"]), + ("FooBar100500Bingo", ["foo", "bar", "bingo"]), + ("Man45var", ["man", "var"]), + ("method_name", ["method", "name"]), + ("Method_Name", ["method", "name"]), + ("101dalms", ["dalm"]), + ("101_dalms", ["dalm"]), + ("101_DalmsBug", ["dalm", "bug"]), + ("101_Dalms45Bug7", ["dalm", "bug"]), + ("wdSize", ["wd", "size"]), + ("Glint", ["glint"]), + ("foo_BAR", ["foo", "bar"]), + ( + "sourced.ml.algorithms.uast_ids_to_bag", + ["sourc", "ml", "algorithm", "uast", "ids", "to", "bag"], + ), + ("WORSTnameYOUcanIMAGINE", ["worst", "name", "you", "can", "imagin"]), + # Another bad example. Parser failed to parse it correctly + ("SmallIdsToFoOo", ["small", "ids", "to", "fo", "oo"]), + ("SmallIdFooo", ["small", "id", "fooo"]), + ("ONE_M0re_.__badId.example", ["one", "m", "re", "bad", "id", "exampl"]), + ("never_use_Such__varsableNames", ["never", "use", "such", "varsabl", "name"]), + ("a.b.c.d", ["a", "b", "c", "d"]), + ("A.b.Cd.E", ["a", "b", "cd", "e"]), + ("looong_sh_loooong_sh", ["looong", "sh", "loooong", "sh"]), + ("sh_sh_sh_sh", ["sh", "sh", "sh", "sh"]), + ("loooong_loooong_loooong", ["loooong", "loooong", "loooong"]), + ] + + for token, correct in tokens: + res = list(self.tp.process_token(token)) + self.assertEqual(res, correct) + + min_split_length = 3 + self.tp.min_split_length = min_split_length + for token, correct in tokens: + res = list(self.tp.process_token(token)) + self.assertEqual(res, [c for c in correct if len(c) >= min_split_length]) + + def test_split(self): + self.assertEqual(list(self.tp.split("set for")), ["set", "for"]) + self.assertEqual(list(self.tp.split("set /for.")), ["set", "for"]) + self.assertEqual(list(self.tp.split("NeverHav")), ["never", "hav"]) + self.assertEqual(list(self.tp.split("PrintAll")), ["print", "all"]) + self.assertEqual(list(self.tp.split("PrintAllExcept")), ["print", "all", "except"]) + self.assertEqual( + list(self.tp.split("print really long line")), + # 'longli' is expected artifact due to edge effects + ["print", "really", "long", "longli"], + ) + self.assertEqual( + list(self.tp.split("set /for. 
*&PrintAll")), ["set", "for", "print", "all"] + ) + self.assertEqual(list(self.tp.split("JumpDown not Here")), ["jump", "down", "not", "here"]) + + self.assertEqual(list(self.tp.split("a b c d")), ["a", "b", "c", "d"]) + self.assertEqual( + list(self.tp.split("a b long c d")), ["a", "b", "long", "blong", "longc", "d"] + ) + self.assertEqual(list(self.tp.split("AbCd")), ["ab", "cd"]) + + def test_split_single_shot(self): + self.tp._single_shot = True + self.tp.min_split_length = 1 + self.assertEqual( + list(self.tp.split("print really long line")), + # 'longli' is expected artifact due to edge effects + ["print", "really", "long", "li"], + ) + self.assertEqual(list(self.tp.split("a b c d")), ["a", "b", "c", "d"]) + self.assertEqual(list(self.tp.split("a b long c d")), ["a", "b", "long", "c", "d"]) + self.assertEqual(list(self.tp.split("AbCd")), ["ab", "cd"]) + + def test_stem(self): + self.assertEqual(self.tp.stem("lol"), "lol") + self.assertEqual(self.tp.stem("apple"), "appl") + self.assertEqual(self.tp.stem("orange"), "orang") + self.assertEqual(self.tp.stem("embedding"), "embed") + self.assertEqual(self.tp.stem("Alfred"), "Alfred") + self.assertEqual(self.tp.stem("Pluto"), "Pluto") + + def test_pickle(self): + tp = pickle.loads(pickle.dumps(self.tp)) + self.assertEqual(tp.stem("embedding"), "embed") + + +class NoopTokenParserTests(unittest.TestCase): + def setUp(self): + self.tp = NoopTokenParser() + + def test_process_token(self): + self.assertEqual(list(self.tp.process_token("abcdef")), ["abcdef"]) + self.assertEqual(list(self.tp.process_token("abcd_ef")), ["abcd_ef"]) + self.assertEqual(list(self.tp.process_token("abcDef")), ["abcDef"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_uast_inttypes_to_graphlets.py b/sourced/ml/core/tests/test_uast_inttypes_to_graphlets.py new file mode 100644 index 0000000..90e69d1 --- /dev/null +++ b/sourced/ml/core/tests/test_uast_inttypes_to_graphlets.py @@ -0,0 +1,20 @@ +import unittest + +from bblfsh import BblfshClient + +from sourced.ml.core.algorithms.uast.inttypes_to_graphlets import Uast2GraphletBag +from sourced.ml.core.tests.models import SOURCE_PY + + +class Uast2GraphletBagTest(unittest.TestCase): + def setUp(self): + self.graphlets_bag_extractor = Uast2GraphletBag() + self.uast = BblfshClient("0.0.0.0:9432").parse(SOURCE_PY).uast + + def test_uast_to_bag(self): + bag = self.graphlets_bag_extractor(self.uast) + self.assertGreater(len(bag), 0, "Expected size of bag should be > 0") + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_uast_struct_to_bag.py b/sourced/ml/core/tests/test_uast_struct_to_bag.py new file mode 100644 index 0000000..5a888e6 --- /dev/null +++ b/sourced/ml/core/tests/test_uast_struct_to_bag.py @@ -0,0 +1,57 @@ +import unittest + +from bblfsh import BblfshClient + +from sourced.ml.core.algorithms.uast.struct_to_bag import UastRandomWalk2Bag, UastSeq2Bag +from sourced.ml.core.tests.models import SOURCE_PY + + +class Uast2RandomWalk2BagTest(unittest.TestCase): + def setUp(self): + self.uast_random_walk2bag = UastRandomWalk2Bag(seq_len=[2, 3]) + self.uast = BblfshClient("0.0.0.0:9432").parse(SOURCE_PY).uast + + def test_uast_to_bag(self): + bag = self.uast_random_walk2bag(self.uast) + self.assertGreater(len(bag), 0, "Expected size of bag should be > 0") + + def test_equivalence_prepare_starting_nodes(self): + starting_nodes_old = self.prepare_starting_nodes(self.uast) + starting_nodes = 
self.uast_random_walk2bag.uast2walks.prepare_starting_nodes(self.uast) + self.assertEqual(len(starting_nodes_old), len(starting_nodes)) + + def structure(tree): + from collections import Counter + + return set(Counter(len(node.children) for node in tree)) + + self.assertEqual(structure(starting_nodes_old), structure(starting_nodes)) + + def prepare_starting_nodes(self, uast): + starting_nodes = [] + self._prepare_starting_nodes(uast, None, starting_nodes) + + return starting_nodes + + def _prepare_starting_nodes(self, root, parent, starting_nodes): + node = self.uast_random_walk2bag.uast2walks._extract_node(node=root, parent=parent) + starting_nodes.append(node) + + for ch in root.children: + node.children.append( + self._prepare_starting_nodes(ch, parent=node, starting_nodes=starting_nodes) + ) + + +class UastSeq2BagTest(unittest.TestCase): + def setUp(self): + self.uast_seq2bag = UastSeq2Bag(seq_len=[2, 3]) + self.uast = BblfshClient("0.0.0.0:9432").parse(SOURCE_PY).uast + + def test_uast_to_bag(self): + bag = self.uast_seq2bag(self.uast) + self.assertGreater(len(bag), 0, "Expected size of bag should be > 0") + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_uast_to_id_distance.py b/sourced/ml/core/tests/test_uast_to_id_distance.py new file mode 100644 index 0000000..aa1b491 --- /dev/null +++ b/sourced/ml/core/tests/test_uast_to_id_distance.py @@ -0,0 +1,130 @@ +import unittest + +from bblfsh import BblfshClient + +from sourced.ml.core.algorithms.token_parser import NoopTokenParser +from sourced.ml.core.algorithms.uast.id_distance import Uast2IdLineDistance, Uast2IdTreeDistance +from sourced.ml.core.tests.models import SOURCE_PY + + +class Uast2IdTreeDistanceTest(unittest.TestCase): + def setUp(self): + self.uast2role_id_pairs = Uast2IdTreeDistance( + token_parser=NoopTokenParser(), max_distance=4 + ) + self.uast = BblfshClient("0.0.0.0:9432").parse(SOURCE_PY).uast + self.maxDiff = None + + def test_result(self): + correct = [ + (("__spec__", "ModuleSpec"), 2), + (("__spec__", "ModuleSpec"), 3), + (("__spec__", "ModuleSpec"), 3), + (("collections", "ModuleSpec"), 2), + (("collections", "ModuleSpec"), 2), + (("collections", "ModuleSpec"), 3), + (("collections", "__spec__"), 3), + (("namedtuple", "ModuleSpec"), 3), + (("namedtuple", "ModuleSpec"), 3), + (("namedtuple", "ModuleSpec"), 3), + (("namedtuple", "ModuleSpec"), 3), + (("namedtuple", "collections"), 3), + (("namedtuple", "collections"), 3), + (("setup", "modelforge.logs"), 3), + (("setup_logging", "modelforge.logs"), 3), + (("sys", "modelforge.logs"), 3), + (("sys", "modules"), 2), + (("utmain", "ModuleSpec"), 2), + (("utmain", "ModuleSpec"), 3), + (("utmain", "ModuleSpec"), 3), + (("utmain", "__package__"), 2), + (("utmain", "__spec__"), 2), + (("utmain", "__spec__"), 2), + (("utmain", "collections"), 3), + (("utmain", "modelforge.logs"), 2), + (("utmain", "modelforge.logs"), 2), + (("utmain", "setup"), 3), + (("utmain", "setup"), 3), + (("utmain", "setup_logging"), 3), + (("utmain", "setup_logging"), 3), + (("utmain", "sys"), 3), + (("utmain", "sys"), 3), + ] + + res = sorted(self.uast2role_id_pairs(self.uast)) + self.assertEqual(res, correct) + + +class Uast2IdLineDistanceTest(unittest.TestCase): + def setUp(self): + self.uast2role_id_pairs = Uast2IdLineDistance( + token_parser=NoopTokenParser(), max_distance=3 + ) + self.uast = BblfshClient("0.0.0.0:9432").parse(SOURCE_PY).uast + self.maxDiff = None + + def test_result(self): + correct = [ + (("__spec__", "ModuleSpec"), 0), + 
(("__spec__", "ModuleSpec"), 1), + (("__spec__", "ModuleSpec"), 1), + (("__spec__", "__package__"), 0), + (("collections", "ModuleSpec"), 2), + (("collections", "__package__"), 1), + (("collections", "__spec__"), 1), + (("modules", "__package__"), 1), + (("modules", "__spec__"), 1), + (("modules", "collections"), 2), + (("namedtuple", "ModuleSpec"), 0), + (("namedtuple", "ModuleSpec"), 1), + (("namedtuple", "ModuleSpec"), 2), + (("namedtuple", "ModuleSpec"), 2), + (("namedtuple", "__package__"), 1), + (("namedtuple", "__spec__"), 1), + (("namedtuple", "__spec__"), 1), + (("namedtuple", "collections"), 0), + (("namedtuple", "collections"), 2), + (("namedtuple", "modules"), 2), + (("setup_logging", "modelforge.logs"), 0), + (("setup_logging", "setup"), 1), + (("sys", "__package__"), 1), + (("sys", "__spec__"), 1), + (("sys", "collections"), 2), + (("sys", "modelforge.logs"), 2), + (("sys", "modules"), 0), + (("sys", "namedtuple"), 2), + (("sys", "setup_logging"), 2), + (("utmain", "ModuleSpec"), 0), + (("utmain", "ModuleSpec"), 1), + (("utmain", "ModuleSpec"), 1), + (("utmain", "ModuleSpec"), 1), + (("utmain", "ModuleSpec"), 2), + (("utmain", "__package__"), 0), + (("utmain", "__package__"), 0), + (("utmain", "__package__"), 1), + (("utmain", "__spec__"), 0), + (("utmain", "__spec__"), 0), + (("utmain", "__spec__"), 0), + (("utmain", "__spec__"), 1), + (("utmain", "__spec__"), 2), + (("utmain", "collections"), 1), + (("utmain", "collections"), 1), + (("utmain", "collections"), 2), + (("utmain", "modules"), 0), + (("utmain", "modules"), 1), + (("utmain", "modules"), 1), + (("utmain", "namedtuple"), 1), + (("utmain", "namedtuple"), 1), + (("utmain", "namedtuple"), 1), + (("utmain", "namedtuple"), 2), + (("utmain", "sys"), 0), + (("utmain", "sys"), 1), + (("utmain", "sys"), 1), + ] + + res = sorted(self.uast2role_id_pairs(self.uast)) + self.assertEqual(res, correct) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_uast_to_id_sequence.py b/sourced/ml/core/tests/test_uast_to_id_sequence.py new file mode 100644 index 0000000..f2314a6 --- /dev/null +++ b/sourced/ml/core/tests/test_uast_to_id_sequence.py @@ -0,0 +1,44 @@ +import unittest + +from bblfsh import BblfshClient + +from sourced.ml.core.algorithms.token_parser import NoopTokenParser +from sourced.ml.core.algorithms.uast.to_id_sequence import Uast2IdSequence +from sourced.ml.core.tests.models import SOURCE_PY + + +class Uast2IdSequenceTest(unittest.TestCase): + def setUp(self): + self.uast2id_sequence = Uast2IdSequence(token_parser=NoopTokenParser()) + self.uast = BblfshClient("0.0.0.0:9432").parse(SOURCE_PY).uast + + def test_result(self): + correct = [ + "sys", + "setup_logging", + "modelforge.logs", + "utmain", + "modules", + "sys", + "__package__", + "utmain", + "__spec__", + "utmain", + "namedtuple", + "collections", + "ModuleSpec", + "namedtuple", + "__spec__", + "utmain", + "ModuleSpec", + "ModuleSpec", + "utmain", + "setup", + "setup_logging", + ] + res = self.uast2id_sequence(self.uast) + self.assertEqual(res, self.uast2id_sequence.concat(correct)) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/tests/test_uast_to_role_id_pairs.py b/sourced/ml/core/tests/test_uast_to_role_id_pairs.py new file mode 100644 index 0000000..ac15214 --- /dev/null +++ b/sourced/ml/core/tests/test_uast_to_role_id_pairs.py @@ -0,0 +1,44 @@ +import unittest + +from bblfsh import BblfshClient + +from sourced.ml.core.algorithms.token_parser import NoopTokenParser +from 
sourced.ml.core.algorithms.uast.to_role_id_pairs import Uast2RoleIdPairs +from sourced.ml.core.tests.models import SOURCE_PY + + +class Uast2NodesBagTest(unittest.TestCase): + def setUp(self): + self.uast2role_id_pairs = Uast2RoleIdPairs(token_parser=NoopTokenParser()) + self.uast = BblfshClient("0.0.0.0:9432").parse(SOURCE_PY).uast + + def test_result(self): + correct = [ + ("ModuleSpec", "BODY | IF | THEN"), + ("ModuleSpec", "IDENTIFIER | EXPRESSION | CALL | CALLEE"), + ("ModuleSpec", "STATEMENT | INCOMPLETE"), + ("__package__", "BINARY | EXPRESSION | CONDITION"), + ("__spec__", "BINARY | EXPRESSION | CONDITION"), + ("__spec__", "BODY | IF | THEN"), + ("collections", "IDENTIFIER | IMPORT | PATHNAME"), + ("modelforge.logs", "IDENTIFIER | IMPORT | PATHNAME"), + ("modules", "RIGHT | EXPRESSION | INCOMPLETE"), + ("namedtuple", "IDENTIFIER | EXPRESSION | CALL | CALLEE"), + ("namedtuple", "IDENTIFIER | IMPORT | PATHNAME"), + ("setup", "IDENTIFIER | DECLARATION | FUNCTION | NAME"), + ("setup_logging", "IDENTIFIER | EXPRESSION | CALL | CALLEE"), + ("setup_logging", "IDENTIFIER | IMPORT | PATHNAME"), + ("sys", "IDENTIFIER | IMPORT | PATHNAME"), + ("sys", "RIGHT | EXPRESSION | INCOMPLETE"), + ("utmain", "BINARY | EXPRESSION | CONDITION"), + ("utmain", "BINARY | EXPRESSION | CONDITION"), + ("utmain", "BODY | IF | THEN"), + ("utmain", "FILE | MODULE"), + ("utmain", "STATEMENT | INCOMPLETE"), + ] + res = sorted(self.uast2role_id_pairs(self.uast)) + self.assertEqual(res, correct) + + +if __name__ == "__main__": + unittest.main() diff --git a/sourced/ml/core/utils/__init__.py b/sourced/ml/core/utils/__init__.py new file mode 100644 index 0000000..41754f0 --- /dev/null +++ b/sourced/ml/core/utils/__init__.py @@ -0,0 +1,3 @@ +# flake8: noqa +from sourced.ml.core.utils.bigartm import install_bigartm +from sourced.ml.core.utils.pickleable_logger import PickleableLogger diff --git a/sourced/ml/core/utils/bblfsh.py b/sourced/ml/core/utils/bblfsh.py new file mode 100644 index 0000000..12b111a --- /dev/null +++ b/sourced/ml/core/utils/bblfsh.py @@ -0,0 +1,19 @@ +from distutils.version import StrictVersion + +from bblfsh.client import BblfshClient + +BBLFSH_VERSION_LOW = "2.2" +BBLFSH_VERSION_HIGH = "3.0" + + +def check_version(host: str = "0.0.0.0", port: str = "9432") -> bool: + """ + Check if the bblfsh server version matches module requirements. 
+ + :param host: bblfsh server host + :param port: bblfsh server port + :return: True if bblfsh version specified matches requirements + """ + # get version and remove leading 'v' + version = StrictVersion(BblfshClient("%s:%s" % (host, port)).version().version.lstrip("v")) + return StrictVersion(BBLFSH_VERSION_LOW) <= version < StrictVersion(BBLFSH_VERSION_HIGH) diff --git a/sourced/ml/core/utils/bblfsh_roles.py b/sourced/ml/core/utils/bblfsh_roles.py new file mode 100644 index 0000000..22f8569 --- /dev/null +++ b/sourced/ml/core/utils/bblfsh_roles.py @@ -0,0 +1,14 @@ +import bblfsh + + +IDENTIFIER = bblfsh.role_id("IDENTIFIER") +QUALIFIED = bblfsh.role_id("QUALIFIED") +LITERAL = bblfsh.role_id("LITERAL") +OPERATOR = bblfsh.role_id("OPERATOR") +EXPRESSION = bblfsh.role_id("EXPRESSION") +LEFT = bblfsh.role_id("LEFT") +BINARY = bblfsh.role_id("BINARY") +ASSIGNMENT = bblfsh.role_id("ASSIGNMENT") +FUNCTION = bblfsh.role_id("FUNCTION") +DECLARATION = bblfsh.role_id("DECLARATION") +NAME = bblfsh.role_id("NAME") diff --git a/sourced/ml/core/utils/bigartm.py b/sourced/ml/core/utils/bigartm.py new file mode 100644 index 0000000..7567100 --- /dev/null +++ b/sourced/ml/core/utils/bigartm.py @@ -0,0 +1,60 @@ +import glob +import logging +import multiprocessing +import os +import shutil +import subprocess +import tempfile + + +def execute(cmd, cwd, log): + log.info(">>> %s", cmd) + parsed = [v for v in cmd.split(" ") if v] + subprocess.check_call(parsed, cwd=cwd) + + +def install_bigartm(args=None, target="./bigartm", tempdir=None): + """ + Deploys bigartm/bigartm at the specified path. + + :param args: :class:`argparse.Namespace` with "output" and "tmpdir". \ + "output" sets the target directory, "tmpdir" sets \ + the temporary directory which is used to clone bigartm/bigartm \ + and build it. + :param target: The path to the built executable. If args is not None, it \ + becomes overridden. + :param tempdir: The temporary directory where to clone and build \ + bigartm/bigartm. If args is not None, it becomes overridden. + :return: None if successful; otherwise, the error code (can be 0!). 
+ """ + log = logging.getLogger("bigartm") + if args is not None: + tempdir = args.tmpdir + target = os.path.join(args.output, "bigartm") + if shutil.which(os.path.basename(target)) or shutil.which(target, path=os.getcwd()): + log.warning("bigartm is in the PATH, no-op.") + return 0 + if not shutil.which("cmake"): + log.error("You need to install cmake.") + return 1 + parent_dir = os.path.dirname(target) + os.makedirs(parent_dir, exist_ok=True) + if not os.path.isdir(parent_dir): + log.error("%s is not a directory.", parent_dir) + return 2 + with tempfile.TemporaryDirectory(prefix="bigartm-", dir=tempdir) as tmpdir: + log.info("Building bigartm/bigartm in %s...", tmpdir) + execute( + "git clone --single-branch --depth=1 https://github.com/bigartm/bigartm .", tmpdir, log + ) + cwd = os.path.join(tmpdir, "build") + os.mkdir(cwd) + execute( + "cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DPYTHON=python3 ..", cwd, log + ) + execute("make -j%d" % multiprocessing.cpu_count(), cwd, log) + whl_path = glob.glob(os.path.join(tmpdir, "build/python/*.whl"))[0] + execute('pip3 install "%s"' % whl_path, cwd, log) + shutil.copyfile(os.path.join(cwd, "bin", "bigartm"), target) + os.chmod(target, 0o777) + log.info("Installed %s", os.path.abspath(target)) diff --git a/sourced/ml/core/utils/pickleable_logger.py b/sourced/ml/core/utils/pickleable_logger.py new file mode 100644 index 0000000..31d5bd0 --- /dev/null +++ b/sourced/ml/core/utils/pickleable_logger.py @@ -0,0 +1,35 @@ +import logging + + +class PickleableLogger: + """ + Base class which provides the logging features through ``self._log``. + + Can be safely pickled. + """ + + def __init__(self, log_level=logging.INFO): + """ + Class constructor + + :param log_level: logging level. + """ + self._log = logging.getLogger(self._get_log_name()) + self._log.setLevel(log_level) + + def __getstate__(self): + state = self.__dict__.copy() + state["_log"] = self._log.level + return state + + def __setstate__(self, state): + self.__dict__.update(state) + log_level = state["_log"] + self._log = logging.getLogger(self._get_log_name()) + self._log.setLevel(log_level) + + def _get_log_name(self): + """ + Children must implement this method. It shall return the logger's name. 
+ """ + raise NotImplementedError diff --git a/sourced/ml/core/utils/projector.py b/sourced/ml/core/utils/projector.py new file mode 100644 index 0000000..273ee32 --- /dev/null +++ b/sourced/ml/core/utils/projector.py @@ -0,0 +1,111 @@ +from http.server import HTTPServer, SimpleHTTPRequestHandler, test +import logging +import os +import shutil +import threading +import time + + +class CORSWebServer: + def __init__(self): + self.thread = None + self.server = None + + def serve(self): + outer = self + + class ClojureServer(HTTPServer): + def __init__(self, *args, **kwargs): + HTTPServer.__init__(self, *args, **kwargs) + outer.server = self + + class CORSRequestHandler(SimpleHTTPRequestHandler): + def end_headers(self): + self.send_header("Access-Control-Allow-Origin", "*") + SimpleHTTPRequestHandler.end_headers(self) + + test(CORSRequestHandler, ClojureServer) + + def start(self): + self.thread = threading.Thread(target=self.serve) + self.thread.start() + + def stop(self): + if self.running: + self.server.shutdown() + self.server.server_close() + self.thread.join() + self.server = None + self.thread = None + + @property + def running(self): + return self.server is not None + + +web_server = CORSWebServer() + + +def present_embeddings(destdir, run_server, labels, index, embeddings): + log = logging.getLogger("projector") + log.info("Writing Tensorflow Projector files...") + if not os.path.isdir(destdir): + os.makedirs(destdir) + os.chdir(destdir) + metaf = "id2vec_meta.tsv" + with open(metaf, "w") as fout: + if len(labels) > 1: + fout.write("\t".join(labels) + "\n") + for item in index: + if len(labels) > 1: + fout.write("\t".join(item) + "\n") + else: + fout.write(item + "\n") + log.info("Wrote %s", metaf) + dataf = "id2vec_data.tsv" + with open(dataf, "w") as fout: + for vec in embeddings: + fout.write("\t".join(str(v) for v in vec)) + fout.write("\n") + log.info("Wrote %s", dataf) + jsonf = "id2vec.json" + with open(jsonf, "w") as fout: + fout.write( + """{ + "embeddings": [ + { + "tensorName": "id2vec", + "tensorShape": [%s, %s], + "tensorPath": "http://0.0.0.0:8000/%s", + "metadataPath": "http://0.0.0.0:8000/%s" + } + ] +} +""" + % (len(embeddings), len(embeddings[0]), dataf, metaf) + ) + log.info("Wrote %s", jsonf) + if run_server and not web_server.running: + web_server.start() + url = "http://projector.tensorflow.org/?config=http://0.0.0.0:8000/" + jsonf + log.info(url) + if run_server: + if shutil.which("xdg-open") is not None: + os.system("xdg-open " + url) + else: + browser = os.getenv("BROWSER", "") + if browser: + os.system(browser + " " + url) + else: + print("\t" + url) + + +def wait(): + log = logging.getLogger("projector") + secs = int(os.getenv("PROJECTOR_SERVER_TIME", "60")) + log.info("Sleeping for %d seconds, safe to Ctrl-C" % secs) + try: + time.sleep(secs) + except KeyboardInterrupt: + pass + web_server.stop()