#!/usr/bin/env python
# -*- coding: ASCII -*-

'''
(c) University of Washington, Hudson-Alpha Institute for Biotechnology and Berlin Institute of Health 2013-2020. All rights reserved.

This file describes how to train a CADD model from the provided training data.
This script needs about 300 GB of RAM to run properly and will likely fail on
machines with less memory.

Please note that we do not provide the training matrix file directly; it can
be generated by concatenating the four .csv.gz files in the respective
training directories:

zcat *.csv.gz | gzip -c > train.csv.gz
'''

### To run this script, Python with numpy, scikit-learn and joblib needs to be installed
import numpy as np
from sklearn.linear_model import LogisticRegression
import sklearn.preprocessing
import joblib  # scikit-learn < 0.23 shipped this as sklearn.externals.joblib
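
# Optional sketch: warn early if the machine has clearly less than the
# ~300 GB of RAM mentioned above. This assumes the third-party `psutil`
# package, which is not required by the rest of the script.
try:
    import psutil
    if psutil.virtual_memory().total < 300 * 1024**3:
        print('Warning: less than ~300 GB of RAM detected; '
              'loading the full training matrix may fail.')
except ImportError:
    pass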

### File names, adjust as needed
TRAINING_MATRIX = 'train.csv.gz'
OUTPUT_MODEL_FILE = 'CADD.mod'

# load the training data; np.loadtxt transparently decompresses .gz files
mat = np.loadtxt(TRAINING_MATRIX, delimiter=",")
Y = mat[:, 0]   # first column holds the class labels
X = mat[:, 1:]  # remaining columns hold the annotation features
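
# For a quick smoke test on limited hardware, a subset of the matrix can be
# loaded instead (sketch; the row count here is arbitrary):
#   mat = np.loadtxt(TRAINING_MATRIX, delimiter=",", max_rows=100000)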

# scale the training data to unit variance (no mean centering)
scaler = sklearn.preprocessing.StandardScaler(with_mean=False, copy=False)
X = scaler.fit_transform(X)

# train the model: L2-regularized logistic regression, up to 13 lbfgs iterations
clf = LogisticRegression(penalty='l2',
                         C=1,
                         max_iter=13,
                         solver='lbfgs',
                         warm_start=True,
                         verbose=3)
clf.fit(X, Y)
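
# Optional sanity check (sketch): report accuracy on the training data itself.
# This is not the held-out evaluation described below, only a rough indicator
# that the fit succeeded.
print('Training accuracy: %.4f' % clf.score(X, Y))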

# store model and scaler together (compress level 3)
joblib.dump((clf, scaler), OUTPUT_MODEL_FILE, 3)
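
# Later re-use of the stored model could look as follows (sketch; `X_new` is a
# hypothetical feature matrix with the same column layout as the training data):
#   clf, scaler = joblib.load(OUTPUT_MODEL_FILE)
#   raw_scores = clf.decision_function(scaler.transform(X_new))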

'''
The generated model file can be used with the CADD scripts as available at
github.com/kircherlab/CADD-scripts

Note that the coefficients of the generated model will not be 100% identical
to those of our released CADD models. This is because we train our models with
a random 1% hold-out test set. Further, we do not train in a single call that
runs all 13 iterations, but step-wise, one iteration after the other, in order
to save, evaluate and compare the intermediate models.
'''
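
# The step-wise training described above can be sketched like this (commented
# out so it does not run by default; the file names are placeholders):
#
# clf = LogisticRegression(penalty='l2', C=1, max_iter=1,
#                          solver='lbfgs', warm_start=True, verbose=3)
# for iteration in range(1, 14):
#     clf.fit(X, Y)  # warm_start continues from the previous coefficients
#     joblib.dump((clf, scaler), 'CADD_iter%i.mod' % iteration, 3)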

