How to use indigopy

Example code showing how to use the indigopy package.
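Every example below follows the same three-step pattern: load a bundled sample dataset with load_sample, convert drug-interaction lists into machine-learning features with featurize, and map interaction scores to class labels with classify. As a compact reference (argument names mirror the cells below; the thresholds shown are the ones used in the E. coli example), the workflow looks like this:

# Compact sketch of the workflow used throughout this notebook
# (mirrors the E. coli example below; thresholds are example values).
from indigopy.core import load_sample, featurize, classify

sample = load_sample('ecoli')                                   # bundled example dataset
data   = featurize(sample['train']['interactions'],             # drug interactions -> feature table
                   sample['profiles'],
                   feature_names=sample['feature_names'],
                   key=sample['key'], silent=True)
X      = data['feature_df'].to_numpy().transpose()              # one row per interaction, one column per feature
y      = classify(sample['train']['scores'],                    # interaction scores -> class labels
                  thresholds=(-0.5, 2), classes=('S', 'N', 'A'))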

Set up environment

[1]:
# Import dependencies
import pandas as pd
from scipy.stats import spearmanr
from sklearn.metrics import r2_score, classification_report
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import plotly.graph_objects as go

# Import package functions
import sys
sys.path.append('c:/Users/User/github/INDIGOpy/') # modify if testing locally on a different machine; remove once the package is published
from indigopy.core import load_sample, featurize, classify

Example: E. coli

[2]:
# Load sample data
sample = load_sample('ecoli')

# Define input arguments
key             = sample['key']
profiles        = sample['profiles']
feature_names   = sample['feature_names']
train_ixns      = sample['train']['interactions']
train_scores    = sample['train']['scores']
test_ixns       = sample['test']['interactions']
test_scores     = sample['test']['scores']

# Determine ML features
train_data      = featurize(train_ixns, profiles, feature_names=feature_names, key=key, silent=True)
test_data       = featurize(test_ixns, profiles, feature_names=feature_names, key=key, silent=True)
X_train, X_test = train_data['feature_df'].to_numpy().transpose(), test_data['feature_df'].to_numpy().transpose()

[5]:
print(test_data['feature_df'])
                        AMK + FUS  AMK + RIF  AMK + SPE  AMK + VAN  CEF + FUS  \
sigma-neg-ECK1963-HCHA        0.0        0.0        0.0        1.0        0.0
sigma-neg-ECK1488-PQQL        0.0        0.0        0.0        0.0        0.0
sigma-neg-ECK0813-YBIW        0.0        0.0        0.0        0.0        0.0
sigma-neg-ECK2456-EUTP        0.0        0.0        0.0        0.0        0.0
sigma-neg-ECK3716-BGLG        0.0        0.0        0.0        0.0        0.0
...                           ...        ...        ...        ...        ...
delta-pos-ECK2585-KGTP        0.0        0.0        0.0        0.0        0.0
delta-pos-ECK1179-CVRA        0.0        0.0        0.0        0.0        0.0
delta-pos-ECK3002-YQHC        0.0        0.0        0.0        0.0        0.0
delta-pos-ECK3103-TDCE        0.0        0.0        0.0        0.0        0.0
delta-pos-ECK3742-RBSD        0.0        0.0        0.0        0.0        0.0

                        CEF + RIF  CEF + SPE  CEF + VAN  CHL + FUS  CHL + RIF  \
sigma-neg-ECK1963-HCHA        0.0        0.0        1.0        0.0        0.0
sigma-neg-ECK1488-PQQL        0.0        0.0        0.0        0.0        0.0
sigma-neg-ECK0813-YBIW        0.0        0.0        0.0        0.0        0.0
sigma-neg-ECK2456-EUTP        0.0        0.0        0.0        0.0        0.0
sigma-neg-ECK3716-BGLG        0.0        0.0        0.0        0.0        0.0
...                           ...        ...        ...        ...        ...
delta-pos-ECK2585-KGTP        0.0        0.0        0.0        0.0        0.0
delta-pos-ECK1179-CVRA        0.0        0.0        0.0        0.0        0.0
delta-pos-ECK3002-YQHC        0.0        0.0        0.0        0.0        0.0
delta-pos-ECK3103-TDCE        0.0        0.0        0.0        1.0        1.0
delta-pos-ECK3742-RBSD        0.0        0.0        0.0        1.0        1.0

                        ...  RIF + TOB  RIF + TMP  RIF + VAN  SPE + TET  \
sigma-neg-ECK1963-HCHA  ...        0.0        0.0        1.0        0.0
sigma-neg-ECK1488-PQQL  ...        0.0        0.0        0.0        0.0
sigma-neg-ECK0813-YBIW  ...        0.0        1.0        0.0        0.0
sigma-neg-ECK2456-EUTP  ...        0.0        0.0        0.0        0.0
sigma-neg-ECK3716-BGLG  ...        0.0        0.0        0.0        0.0
...                     ...        ...        ...        ...        ...
delta-pos-ECK2585-KGTP  ...        0.0        0.0        0.0        0.0
delta-pos-ECK1179-CVRA  ...        0.0        0.0        0.0        0.0
delta-pos-ECK3002-YQHC  ...        0.0        0.0        0.0        1.0
delta-pos-ECK3103-TDCE  ...        0.0        0.0        0.0        0.0
delta-pos-ECK3742-RBSD  ...        0.0        0.0        0.0        0.0

                        SPE + TOB  SPE + TMP  SPE + VAN  TET + VAN  TOB + VAN  \
sigma-neg-ECK1963-HCHA        0.0        0.0        1.0        1.0        1.0
sigma-neg-ECK1488-PQQL        0.0        0.0        0.0        0.0        0.0
sigma-neg-ECK0813-YBIW        0.0        1.0        0.0        0.0        0.0
sigma-neg-ECK2456-EUTP        0.0        0.0        0.0        0.0        0.0
sigma-neg-ECK3716-BGLG        0.0        0.0        0.0        0.0        0.0
...                           ...        ...        ...        ...        ...
delta-pos-ECK2585-KGTP        0.0        0.0        0.0        0.0        0.0
delta-pos-ECK1179-CVRA        0.0        0.0        0.0        0.0        0.0
delta-pos-ECK3002-YQHC        0.0        0.0        0.0        1.0        0.0
delta-pos-ECK3103-TDCE        0.0        0.0        0.0        0.0        0.0
delta-pos-ECK3742-RBSD        0.0        0.0        0.0        0.0        0.0

                        TMP + VAN
sigma-neg-ECK1963-HCHA        1.0
sigma-neg-ECK1488-PQQL        0.0
sigma-neg-ECK0813-YBIW        1.0
sigma-neg-ECK2456-EUTP        0.0
sigma-neg-ECK3716-BGLG        0.0
...                           ...
delta-pos-ECK2585-KGTP        0.0
delta-pos-ECK1179-CVRA        0.0
delta-pos-ECK3002-YQHC        0.0
delta-pos-ECK3103-TDCE        0.0
delta-pos-ECK3742-RBSD        0.0

[15916 rows x 66 columns]
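For reference, the table above has one row per joint-profile feature and one column per drug interaction (15916 features by 66 test interactions), which is why the arrays passed to scikit-learn are transposed: each interaction then becomes one sample row. A minimal sanity check, using the shapes from the printout:

# feature_df is features x interactions; X_test is interactions x features
assert test_data['feature_df'].shape == (15916, 66)
assert X_test.shape == (66, 15916)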
[ ]:
# Determine class labels
thresh, classes = (-0.5, 2), ('S', 'N', 'A')
train_labels    = classify(train_scores, thresholds=thresh, classes=classes)
test_labels     = classify(test_scores, thresholds=thresh, classes=classes)

# Train and apply a regression-based model
reg_model = RandomForestRegressor()
reg_model.fit(X_train, train_scores)
reg_y = reg_model.predict(X_test)
r, p = spearmanr(test_scores, reg_y)
r2 = r2_score(test_scores, reg_y)
print('Regression results:')
print('\tSpearman R = {}'.format(round(r, 4)))
print('\tSpearman p = {:.3g}'.format(p))
print('\tR2 = {}'.format(round(r2, 4)))

# Train and apply a classification-based model
class_model = RandomForestClassifier()
class_model.fit(X_train, train_labels)
class_y = class_model.predict(X_test)
print('Classification results:')
print(classification_report(test_labels, class_y))
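The classify call above maps each interaction score to a class label using the two thresholds. A minimal pure-Python sketch of the thresholding behaviour assumed here (the exact boundary handling is illustrative; see the classify documentation for the precise rule):

# Illustrative sketch of score-to-label thresholding (assumed behaviour, not the package source):
# scores below the lower threshold map to 'S' (synergy), scores above the upper threshold
# to 'A' (antagonism), and everything in between to 'N' (neutral).
def label_scores(scores, thresholds=(-0.5, 2), classes=('S', 'N', 'A')):
    lower, upper = thresholds
    synergy, neutral, antagonism = classes
    return [synergy if s < lower else antagonism if s > upper else neutral for s in scores]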

Example: M. tuberculosis

[3]:
# Load sample data
sample = load_sample('mtb')

# Define input arguments
key             = sample['key']
profiles        = sample['profiles']
feature_names   = sample['feature_names']
train_ixns      = sample['train']['interactions']
train_scores    = sample['train']['scores']
test_ixns       = sample['test']['interactions']
test_scores     = sample['test']['scores']
clinical_ixns   = sample['clinical']['interactions']
clinical_scores = sample['clinical']['scores']

# Determine ML features
train_data      = featurize(train_ixns, profiles, feature_names=feature_names, key=key, silent=True)
test_data       = featurize(test_ixns, profiles, feature_names=feature_names, key=key, silent=True)
clinical_data   = featurize(clinical_ixns, profiles, feature_names=feature_names, key=key, silent=True)
X_train, X_test = train_data['feature_df'].to_numpy().transpose(), test_data['feature_df'].to_numpy().transpose()
X_clinical      = clinical_data['feature_df'].to_numpy().transpose()

# Determine class labels
thresh, classes = (0.9, 1.1), ('S', 'N', 'A')
train_labels    = classify(train_scores, thresholds=thresh, classes=classes)
test_labels     = classify(test_scores, thresholds=thresh, classes=classes)

# Train and apply a regression-based model
reg_model = RandomForestRegressor()
reg_model.fit(X_train, train_scores)
reg_y = reg_model.predict(X_test)
r, p = spearmanr(test_scores, reg_y)
r2 = r2_score(test_scores, reg_y)
print('Regression results:')
print('\tSpearman R = {}'.format(round(r, 4)))
print('\tSpearman p = {:.3g}'.format(p))
print('\tR2 = {}'.format(round(r2, 4)))

# Train and apply a classification-based model
class_model = RandomForestClassifier()
class_model.fit(X_train, train_labels)
class_y = class_model.predict(X_test)
print('Classification results:')
print(classification_report(test_labels, class_y))

# Apply model to clinical data
clinical_y = reg_model.predict(X_clinical)
r, p = spearmanr(clinical_scores, clinical_y)
print('Clinical results:')
print('\tSpearman R = {}'.format(round(-r, 4)))
print('\tSpearman p = {:.3g}'.format(p))
Regression results:
        Spearman R = 0.5883
        Spearman p = 0.000161
        R2 = 0.1507
Classification results:
              precision    recall  f1-score   support

           A       0.67      0.38      0.48        16
           N       0.00      0.00      0.00         1
           S       0.73      0.84      0.78        19

    accuracy                           0.61        36
   macro avg       0.46      0.41      0.42        36
weighted avg       0.68      0.61      0.63        36

Clinical results:
        Spearman R = 0.5607
        Spearman p = 5.74e-06

Example: S. aureus

[4]:
# Load sample data
sample = load_sample('saureus')

# Define input arguments
key             = sample['key']
profiles        = sample['profiles']
feature_names   = sample['feature_names']
train_ixns      = sample['train']['interactions']
train_scores    = sample['train']['scores']
test_ixns       = sample['test']['interactions']
test_scores     = sample['test']['scores']
strains         = sample['orthology']['strains']
orthology_map   = sample['orthology']['map']

# Determine ML features
train_data      = featurize(train_ixns, profiles, feature_names=feature_names, key=key, silent=True)
test_data       = featurize(test_ixns, profiles, feature_names=feature_names, key=key, silent=True)
X_train, X_test = train_data['feature_df'].to_numpy().transpose(), test_data['feature_df'].to_numpy().transpose()

# Determine class labels
thresh, classes = (-0.5, 2), ('S', 'N', 'A')
train_labels    = classify(train_scores, thresholds=thresh, classes=classes)
test_labels     = classify(test_scores, thresholds=thresh, classes=classes)

# Train and apply a regression-based model
reg_model = RandomForestRegressor()
reg_model.fit(X_train, train_scores)
reg_y = reg_model.predict(X_test)
r, p = spearmanr(test_scores, reg_y)
r2 = r2_score(test_scores, reg_y)
print('Regression results:')
print('\tSpearman R = {}'.format(round(r, 4)))
print('\tSpearman p = {:.3g}'.format(p))
print('\tR2 = {}'.format(round(r2, 4)))

# Orthology version
test_data_ortho = featurize(test_ixns, profiles, feature_names=feature_names, key=key,
                            strains=strains, orthology_map=orthology_map, silent=True)
X_test_ortho    = test_data_ortho['feature_df'].to_numpy().transpose()
reg_y_ortho = reg_model.predict(X_test_ortho)
r, p = spearmanr(test_scores, reg_y_ortho)
r2 = r2_score(test_scores, reg_y_ortho)
print('Regression results (with orthology):')
print('\tSpearman R = {}'.format(round(r, 4)))
print('\tSpearman p = {:.3g}'.format(p))
print('\tR2 = {}'.format(round(r2, 4)))

# Train and apply a classification-based model
class_model = RandomForestClassifier()
class_model.fit(X_train, train_labels)
class_y = class_model.predict(X_test)
print('Classification results:')
print(classification_report(test_labels, class_y))
Regression results:
        Spearman R = 0.478
        Spearman p = 0.000898
        R2 = -0.3188
Regression results (with orthology):
        Spearman R = 0.5756
        Spearman p = 3.52e-05
        R2 = -1.2016
Classification results:
              precision    recall  f1-score   support

           A       0.33      0.50      0.40         2
           N       0.50      0.77      0.61        22
           S       0.62      0.24      0.34        21

    accuracy                           0.51        45
   macro avg       0.49      0.50      0.45        45
weighted avg       0.55      0.51      0.48        45

Example: A. baumannii

[3]:
# Load sample data
sample = load_sample('abaumannii')

# Define input arguments
key             = sample['key']
profiles        = sample['profiles']
feature_names   = sample['feature_names']
train_ixns      = sample['train']['interactions']
train_scores    = sample['train']['scores']
test_ixns       = sample['test']['interactions']
test_scores     = sample['test']['scores']
strains         = sample['orthology']['strains']
orthology_map   = sample['orthology']['map']

# Determine ML features
train_data      = featurize(train_ixns, profiles, feature_names=feature_names, key=key, silent=True)
test_data       = featurize(test_ixns, profiles, feature_names=feature_names, key=key, silent=True)
X_train, X_test = train_data['feature_df'].to_numpy().transpose(), test_data['feature_df'].to_numpy().transpose()

# Determine class labels
thresh, classes = (-0.5, 0), ('S', 'N', 'A')
train_labels    = classify(train_scores, thresholds=thresh, classes=classes)
test_labels     = classify(test_scores, thresholds=thresh, classes=classes)

# Train and apply a regression-based model
reg_model = RandomForestRegressor()
reg_model.fit(X_train, train_scores)
reg_y = reg_model.predict(X_test)
r, p = spearmanr(test_scores, reg_y)
r2 = r2_score(test_scores, reg_y)
print('Regression results:')
print('\tSpearman R = {}'.format(round(r, 4)))
print('\tSpearman p = {:.3g}'.format(p))
print('\tR2 = {}'.format(round(r2, 4)))

# Orthology version
test_data_ortho = featurize(test_ixns, profiles, feature_names=feature_names, key=key,
                            strains=strains, orthology_map=orthology_map, silent=True)
X_test_ortho    = test_data_ortho['feature_df'].to_numpy().transpose()
reg_y_ortho = reg_model.predict(X_test_ortho)
r, p = spearmanr(test_scores, reg_y_ortho)
r2 = r2_score(test_scores, reg_y_ortho)
print('Regression results (with orthology):')
print('\tSpearman R = {}'.format(round(r, 4)))
print('\tSpearman p = {:.3g}'.format(p))
print('\tR2 = {}'.format(round(r2, 4)))

# Train and apply a classification-based model
class_model = RandomForestClassifier()
class_model.fit(X_train, train_labels)
class_y = class_model.predict(X_test)
print('Classification results:')
print(classification_report(test_labels, class_y))

# Visualize results
df = pd.DataFrame({
    'x': test_labels,
    'y': reg_y_ortho
})
fig = go.Figure()
fig.add_trace(go.Box(y=df.y[df.x=='A'], name='Antagonism', marker_color='red'))
fig.add_trace(go.Box(y=df.y[df.x=='N'], name='Neutral', marker_color='gray'))
fig.add_trace(go.Box(y=df.y[df.x=='S'], name='Synergy', marker_color='blue'))
fig.update_layout(
    autosize=False,
    width=500,
    height=500,
    title='A. baumannii',
    xaxis_title='True Class',
    yaxis_title='Predicted Score',
    showlegend=False,
    plot_bgcolor='rgba(0,0,0,0)'
)
fig.show()
Regression results:
        Spearman R = 0.6469
        Spearman p = 1.58e-06
        R2 = -0.6263
Regression results (with orthology):
        Spearman R = 0.5968
        Spearman p = 1.51e-05
        R2 = -0.3828
Classification results:
              precision    recall  f1-score   support

           A       0.47      0.82      0.60        17
           N       0.17      0.09      0.12        11
           S       0.78      0.41      0.54        17

    accuracy                           0.49        45
   macro avg       0.47      0.44      0.42        45
weighted avg       0.51      0.49      0.46        45

[Interactive Plotly figure: box plots of predicted interaction score grouped by true class (Antagonism, Neutral, Synergy) for A. baumannii.]
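Because the interactive Plotly figure cannot be rendered in this static export, one option is to save it to a standalone HTML file instead; write_html is part of the standard Plotly figure API, and the filename here is illustrative:

# Optional: save the interactive figure for viewing in a browser
# (the filename is illustrative).
fig.write_html('abaumannii_boxplot.html')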