15 changes: 15 additions & 0 deletions .coveragerc
@@ -0,0 +1,15 @@

[run]
branch = True
source = sourced/ml/core

[report]
exclude_lines =
    no cover
    raise NotImplementedError
    if __name__ == "__main__":
ignore_errors = True
omit =
    sourced/ml/core/tests/*
    sourced/ml/core/swivel.py
    sourced/ml/core/bigartm.py
17 changes: 17 additions & 0 deletions .flake8
@@ -0,0 +1,17 @@
[flake8]
ignore=B008,E121,E123,E126,E203,E226,E24,E704,W503,W504,D100,D105,D200,D301,D402
max-line-length=99
exclude=
    .git
    doc
inline-quotes="
import-order-style=appnexus
application-package-names=sourced.ml.core
per-file-ignores=
    **/tests/**:D
    # Should be resolved one by one
    # Related issue: https://github.com/src-d/ml/issues/354
    ./sourced/ml/core/extractors/*:D
    ./sourced/ml/core/models/**:D
    ./sourced/ml/core/algorithms/**:D
    ./sourced/ml/core/utils/*:D
116 changes: 116 additions & 0 deletions .gitignore
@@ -0,0 +1,116 @@

# Mac OS
*.DS_Store

# PyCharm IDE
.idea/

# Documentation build files
doc/_build/
doc/ast2vec.rst
doc/modules.rst

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# CI
.ci
10 changes: 10 additions & 0 deletions .pylintrc
@@ -0,0 +1,10 @@
[MASTER]
jobs=0
load-plugins=pylint.extensions.docparams

[MESSAGES CONTROL]
disable=all
enable=missing-param-doc,
       differing-param-doc,
       differing-type-doc,
       missing-return-doc
54 changes: 54 additions & 0 deletions .travis.yml
@@ -0,0 +1,54 @@
language: python
sudo: true
dist: xenial
git:
  depth: 9999999
services:
  - docker
cache: pip
before_cache:
  - chown -R travis:travis $HOME/.cache/pip
stages:
  - style
  - test
_install: &_install
  - travis_retry make bblfsh-start
  - pip install --upgrade pip cython codecov
  - ML_CORE_SETUP_INCLUDE_TESTS=1 pip install .[tf]
  - cd $(pip show sourced.ml.core|grep Location|cut -d' ' -f2)/sourced/ml/core
  - find . -wholename "*/tests/*" -type d -exec chmod 555 {} \;
_coverage: &_coverage
  - coverage run --concurrency=multiprocessing -m unittest discover
  - travis_retry coverage combine
matrix:
  fast_finish: true
  include:
    - stage: style
      python: 3.7
      script:
        - make check
      install:
        - pip install -r requirements-lint.txt
    - stage: test
      python: 3.5
      script: *_coverage
      install: *_install
    - stage: test
      python: 3.6
      script: *_coverage
      install: *_install
    - stage: test
      python: 3.7
      script: *_coverage
      install: *_install
      after_success:
        - codecov
    - stage: test
      name: Tests inside docker
      script:
        - make docker-build VERSION=test
        - make docker-test VERSION=test
      install:
        - travis_retry make bblfsh-start
notifications:
  email: false
30 changes: 30 additions & 0 deletions Dockerfile
@@ -0,0 +1,30 @@
FROM ubuntu:18.04

ENV BROWSER=/browser \
    LC_ALL=en_US.UTF-8

COPY requirements.txt ml_core/requirements.txt

RUN apt-get update && \
    apt-get install -y --no-install-suggests --no-install-recommends \
        ca-certificates locales libxml2 libxml2-dev gcc g++ wget \
        python3 python3-dev python3-distutils && \
    echo "en_US.UTF-8 UTF-8" > /etc/locale.gen && \
    locale-gen && \
    wget -O - https://bootstrap.pypa.io/get-pip.py | python3 && \
    cd ml_core && \
    pip3 install --no-cache-dir -r requirements.txt && \
    apt-get remove -y python3-dev libxml2-dev gcc g++ wget && \
    apt-get remove -y .*-doc .*-man >/dev/null && \
    apt-get autoremove -y && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* && \
    echo '#!/bin/bash\n\
\n\
echo\n\
echo " $@"\n\
echo\n\' > /browser && \
    chmod +x /browser

COPY . ml_core/
RUN cd ml_core && pip3 install -e .
38 changes: 38 additions & 0 deletions Makefile
@@ -0,0 +1,38 @@
current_dir = $(shell pwd)

PROJECT = ml_core

DOCKERFILES = Dockerfile:$(PROJECT)
DOCKER_ORG = "srcd"

# Include the src-d/ci Makefile
CI_REPOSITORY ?= https://github.com/src-d/ci.git
CI_BRANCH ?= v1
CI_PATH ?= .ci
MAKEFILE := $(CI_PATH)/Makefile.main
$(MAKEFILE):
	git clone --quiet --depth 1 -b $(CI_BRANCH) $(CI_REPOSITORY) $(CI_PATH);
-include $(MAKEFILE)

.PHONY: check
check:
	! (grep -R /tmp sourced/ml/core/tests)
	flake8 --count
	pylint sourced

.PHONY: test
test:
	python3 -m unittest discover

.PHONY: docker-test
docker-test:
	docker ps | grep bblfshd  # the bblfsh server must be running; try `make bblfsh-start`
	docker run --rm -it --network host --entrypoint python3 -w /ml_core \
		-e SKIP_BBLFSH_UTILS_TESTS=1 \
		srcd/ml_core:$(VERSION) -m unittest discover

.PHONY: bblfsh-start
bblfsh-start:
	! docker ps | grep bblfshd  # the bblfsh server must not already be running
	docker run -d --name ml_core_bblfshd --privileged -p 9432\:9432 bblfsh/bblfshd\:v2.12.1
	docker exec -it ml_core_bblfshd bblfshctl driver install python bblfsh/python-driver\:v2.9.0
100 changes: 6 additions & 94 deletions README.md
@@ -1,95 +1,7 @@
# MLonCode research playground [![PyPI](https://img.shields.io/pypi/v/sourced-ml.svg)](https://pypi.python.org/pypi/sourced-ml) [![Build Status](https://travis-ci.org/src-d/ml.svg)](https://travis-ci.org/src-d/ml) [![Docker Build Status](https://img.shields.io/docker/build/srcd/ml.svg)](https://hub.docker.com/r/srcd/ml) [![codecov](https://codecov.io/github/src-d/ml/coverage.svg)](https://codecov.io/gh/src-d/ml)
# MLonCode Core Library
[![Build Status](https://travis-ci.org/src-d/ml-core.svg)](https://travis-ci.org/src-d/ml-core)
[![codecov](https://codecov.io/github/src-d/ml-core/coverage.svg)](https://codecov.io/gh/src-d/ml-core)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/ambv/black)

This project is the foundation for [MLonCode](https://github.com/src-d/awesome-machine-learning-on-source-code) research and development. It abstracts feature extraction and model training, allowing you to focus on higher-level tasks.

Currently, the following models are implemented:

* BOW - a weighted bag of x, where x can be any of several extracted feature types.
* id2vec - source code identifier embeddings.
* docfreq - feature document frequencies \(part of TF-IDF\).
* topic modeling over source code identifiers.

It is written in Python 3 and has been tested on Linux and macOS. source{d} ml-core is tightly
coupled with [source{d} engine](https://engine.sourced.tech) and delegates all of the feature extraction parallelization to it.

Here is a list of proof-of-concept projects built using ml-core:

* [vecino](https://github.com/src-d/vecino) - finding similar repositories.
* [tmsc](https://github.com/src-d/tmsc) - listing topics of a repository.
* [snippet-ranger](https://github.com/src-d/snippet-ranger) - topic modeling of source code snippets.
* [apollo](https://github.com/src-d/apollo) - source code deduplication at scale.

## Installation

Whether you wish to include Spark in your installation or would rather use an existing
installation, `sourced-ml` needs some native libraries to be installed first,
e.g. on Ubuntu you must run `apt install libxml2-dev libsnappy-dev`. [Tensorflow](https://tensorflow.org)
is also a requirement - both the CPU and GPU versions are supported.
To select one, install either `sourced-ml[tf]` or `sourced-ml[tf-gpu]`.
**If you specify neither extra, TensorFlow will not be installed at all.**
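
For example, a minimal CPU-only setup on Ubuntu might look like this (a sketch based on the
package names above; swap the extra for `sourced-ml[tf-gpu]` to get the GPU build):

```text
apt install libxml2-dev libsnappy-dev
pip3 install sourced-ml[tf]
```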

## Docker image

```text
docker run -it --rm srcd/ml --help
```

If this first command fails with

```text
Cannot connect to the Docker daemon. Is the docker daemon running on this host?
```

and you are sure that the daemon is running, then you need to add your user to the `docker` group; refer to the [documentation](https://docs.docker.com/engine/installation/linux/linux-postinstall/#manage-docker-as-a-non-root-user).
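
One common fix, as described in the linked documentation (log out and back in afterwards):

```text
sudo usermod -aG docker $USER
```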

## Contributions

...are welcome! See [CONTRIBUTING](contributing.md) and [CODE\_OF\_CONDUCT.md](code_of_conduct.md).

## License

[Apache 2.0](license.md)

## Algorithms

#### Identifier embeddings

We build the source code identifier co-occurrence matrix for every repository.

1. Read Git repositories.
2. Classify files using [enry](https://github.com/src-d/enry).
3. Extract [UAST](https://doc.bblf.sh/uast/specification.html) from each supported file.
4. [Split and stem](https://github.com/src-d/ml/tree/d1f13d079f57caa6338bb7eb8acb9062e011eda9/sourced/ml/algorithms/token_parser.py) all the identifiers in each tree.
5. [Traverse UAST](https://github.com/src-d/ml/tree/d1f13d079f57caa6338bb7eb8acb9062e011eda9/sourced/ml/transformers/coocc.py), collapse all non-identifier paths and record all
identifiers on the same level as co-occurring. In addition, connect them to their immediate parents.
6. Write the global co-occurrence matrix.
7. Train the embeddings using [Swivel](https://github.com/src-d/ml/tree/d1f13d079f57caa6338bb7eb8acb9062e011eda9/sourced/ml/algorithms/swivel.py) \(requires Tensorflow\). Interactively view
the intermediate results in Tensorboard using `--logs`.
8. Write the identifier embeddings model.

Steps 1-5 are performed by the `repos2coocc` command, step 6 by `id2vec_preproc`, step 7 by `id2vec_train`, and step 8 by `id2vec_postproc`, as sketched below.
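
A rough sketch of the pipeline order (command arguments are omitted and depend on the installed
CLI version; `--logs` is the Tensorboard option mentioned above):

```text
repos2coocc ...      # steps 1-5: per-repository co-occurrence matrices
id2vec_preproc ...   # step 6: the global co-occurrence matrix
id2vec_train ...     # step 7: Swivel training (TensorFlow); add --logs for Tensorboard
id2vec_postproc ...  # step 8: the identifier embeddings model
```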

#### Weighted Bag of X

We represent every repository as a weighted bag-of-vectors, provided that we have document frequencies \("docfreq"\) and identifier embeddings \("id2vec"\).

1. Clone or read the repository from disk.
2. Classify files using [enry](https://github.com/src-d/enry).
3. Extract [UAST](https://doc.bblf.sh/uast/specification.html) from each supported file.
4. Extract various features from each tree, e.g. identifiers, literals or node2vec-like structural fingerprints.
5. Group by repository, file or function.
6. Set the weight of each such feature according to TF-IDF.
7. Write the BOW model.

Steps 1-7 are performed by the `repos2bow` command.
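
As above, a rough sketch (arguments omitted; they depend on the installed CLI version):

```text
repos2bow ...  # steps 1-7: repositories -> weighted BOW model,
               # given the "docfreq" and "id2vec" models described above
```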

#### Topic modeling

See [here](doc/topic_modeling.md).

## Glossary

See [here](GLOSSARY.md).
Library for machine learning on source code. It provides commonly used algorithms and tools
to process code-related data, such as Babelfish UASTs and plain source code text.
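
A minimal source installation sketch, based on the commands in `.travis.yml` above (the `[tf]` extra
selects the CPU TensorFlow build; the import check assumes the `sourced.ml.core` package layout
introduced by this PR):

```text
git clone https://github.com/src-d/ml-core
cd ml-core
pip3 install .[tf]
python3 -c "import sourced.ml.core"
```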