#!/usr/bin/env python
# -*- coding: ASCII -*-
'''
(c) University of Washington, Hudson-Alpha Institute for Biotechnology
and Berlin Institute of Health 2013-2020. All rights reserved.

This file describes how to train a CADD model from the provided training
data. This script needs about 300 GB of RAM to run properly and may fail
to execute on machines with less memory.

Please note that we do not provide the training matrix file directly but
that it can be generated simply by concatenating the four .csv.gz files
in the respective training directories:

    zcat *.csv.gz | gzip -c > train.csv.gz
'''

### to run the script, the user needs to have python with numpy and sklearn installed
import numpy as np
from sklearn.linear_model import LogisticRegression
import sklearn.preprocessing

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed
# in 0.23; prefer the standalone joblib package, falling back to the old
# location for older scikit-learn installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

### File names, adjust as needed
TRAINING_MATRIX = 'train.csv.gz'
OUTPUT_MODEL_FILE = 'CADD.mod'

# load training data: column 0 is the class label, remaining columns are
# the annotation features
mat = np.loadtxt(TRAINING_MATRIX, delimiter=",")
Y = mat[:, 0].reshape((mat.shape[0],))
X = mat[:, 1:]

# scaling the training data to unified variance; with_mean=False skips
# centering and copy=False scales in place to limit peak memory usage.
# BUGFIX: the original script fitted the scaler but never applied the
# transform, so the model was trained on unscaled features while the
# (unused) scaler was stored alongside it.
scaler = sklearn.preprocessing.StandardScaler(with_mean=False, copy=False)
X = scaler.fit_transform(X)

# train the model (13 lbfgs iterations; warm_start allows resuming
# training from the current coefficients in follow-up fit() calls)
clf = LogisticRegression(penalty='l2', C=1, max_iter=13, solver='lbfgs',
                         warm_start=True, verbose=3)
clf.fit(X, Y)

# store scaler and model together, with compression level 3
joblib.dump((clf, scaler), OUTPUT_MODEL_FILE, 3)

'''
The generated model file can be used with the CADD scripts as available at
github.com/kircherlab/CADD-scripts

Note that the generated model's coefficients will not be 100% identical to
our CADD models. This happens because we train our models with a random 1%
hold-out test set. We further train our models not in a single call of
13-iteration convergence but step-wise, one iteration after the other, in
order to save, evaluate and compare different models.
'''