guides

The submodule that contains the guides, i.e., the weak learners in DebiasedDTA that learn a weighting of the training set to improve generalizability. The implemented guides are IDDTA and BoWDTA, and an abstract class is also available to quickly implement custom guides.

`Guide`

Bases: ABC

An abstract class that implements the interface of a guide in pydebiaseddta. The guides are characterized by a train function and a predict function, whose signatures are implemented by this class. Any instance of the Guide class can be trained in the DebiasedDTA training framework, and therefore, Guide can be inherited to design custom guide models.

Source code in pydebiaseddta/guides/abstract_guide.py

class Guide(ABC):
    """An abstract class that implements the interface of a guide in `pydebiaseddta`.

    The guides are characterized by a `train` function and a `predict` function,
    whose signatures are declared by this class. Concrete guides inherit from
    `Guide` and override both methods; the class itself cannot be instantiated
    because both methods are abstract.
    """

    @abstractmethod
    def train(
        self,
        train_ligands: List[Any],
        train_proteins: List[Any],
        train_labels: List[float],
    ):
        """An abstract method to define the training interface of the guides.

        Parameters
        ----------
        train_ligands : List[Any]
            Training ligands in any representation.
        train_proteins : List[Any]
            Training proteins in any representation.
        train_labels : List[float]
            Affinity scores of the training protein-ligand pairs.
        """
        # `self` is declared explicitly so the abstract signature matches the
        # instance-method overrides and `super().train(...)` is callable.
        pass

    @abstractmethod
    def predict(self, ligands: List[Any], proteins: List[Any]) -> List[float]:
        """An abstract method to define the prediction interface of the guides.

        Parameters
        ----------
        ligands : List[Any]
            Ligands in any representation.
        proteins : List[Any]
            Proteins in any representation.

        Returns
        -------
        List[float]
            The predicted affinities.
        """
        pass

`predict(ligands, proteins)` `abstractmethod`

An abstract method to define the prediction interface of the guides.

Parameters:

Name	Type	Description	Default
`ligands`	`List[Any]`	Ligands in any representation.	required
`proteins`	`List[Any]`	Proteins in any representation.	required

Returns:

Type	Description
`List[float]`	The predicted affinities.

Source code in pydebiaseddta/guides/abstract_guide.py

@abstractmethod
def predict(self, ligands: List[Any], proteins: List[Any]) -> List[float]:
    """An abstract method to define the prediction interface of the guides.

    Parameters
    ----------
    ligands : List[Any]
        Ligands in any representation.
    proteins : List[Any]
        Proteins in any representation.

    Returns
    -------
    List[float]
        The predicted affinities.
    """
    # `self` is declared explicitly so the abstract signature matches the
    # instance-method overrides in concrete guides.
    pass

`train(train_ligands, train_proteins, train_labels)` `abstractmethod`

An abstract method to define the training interface of the guides.

Parameters:

Name	Type	Description	Default
`train_ligands`	`List[Any]`	Training ligands in any representation.	required
`train_proteins`	`List[Any]`	Training proteins in any representation.	required
`train_labels`	`List[float]`	Affinity scores of the training protein-ligand pairs.	required

Source code in pydebiaseddta/guides/abstract_guide.py

@abstractmethod
def train(
    self,
    train_ligands: List[Any],
    train_proteins: List[Any],
    train_labels: List[float],
):
    """An abstract method to define the training interface of the guides.

    Parameters
    ----------
    train_ligands : List[Any]
        Training ligands in any representation.
    train_proteins : List[Any]
        Training proteins in any representation.
    train_labels : List[float]
        Affinity scores of the training protein-ligand pairs.
    """
    # `self` is declared explicitly so the abstract signature matches the
    # instance-method overrides in concrete guides.
    pass

`BoWDTA`

Bases: Guide

Source code in pydebiaseddta/guides/bowdta.py

class BoWDTA(Guide):
    """Guide that feeds bag-of-words vectors of biomolecule words to a decision tree."""

    def __init__(self):
        """Constructor to create a BoWDTA model.

        BoWDTA represents the proteins and ligands as "bag-of-words" and uses
        a decision tree for prediction. BoWDTA uses the same biomolecule
        vocabulary as BPEDTA.
        """
        # Both vectorizers share every option except the out-of-vocabulary token.
        tokenizer_options = {"filters": None, "lower": False}
        self.ligand_bow_vectorizer = Tokenizer(oov_token="C", **tokenizer_options)
        self.protein_bow_vectorizer = Tokenizer(oov_token="$", **tokenizer_options)
        self.prediction_model = DecisionTreeRegressor()

    def tokenize_ligands(self, smiles: List[str]) -> List[List[int]]:
        """Segment the SMILES strings into ligand words and label-encode them.

        Parameters
        ----------
        smiles : List[str]
            The SMILES strings of the ligands.

        Returns
        -------
        List[List[int]]
            Label encoded sequences of ligand words.
        """
        # The SMILES strings are first mapped to their unichar form, which is
        # the input the ligand word identifier is called with.
        unichar_smiles = smiles_to_unichar_batch(
            smiles, load_smiles_to_unichar_encoding()
        )
        ligand_vocabulary = load_ligand_word_identifier(vocab_size=8000)
        # NOTE(review): 100 is presumably the maximum encoded length; confirm
        # against `encode_sequences`.
        return ligand_vocabulary.encode_sequences(unichar_smiles, 100)

    def tokenize_proteins(self, aa_sequences: List[str]) -> List[List[int]]:
        """Segment the amino-acid sequences into protein words and label-encode them.

        Parameters
        ----------
        aa_sequences : List[str]
            The amino-acid sequences of the proteins.

        Returns
        -------
        List[List[int]]
            Label encoded sequences of protein words.
        """
        protein_vocabulary = load_protein_word_identifier(vocab_size=32000)
        # NOTE(review): 1000 is presumably the maximum encoded length; confirm
        # against `encode_sequences`.
        encoded_proteins = protein_vocabulary.encode_sequences(aa_sequences, 1000)
        return encoded_proteins

    def vectorize_ligands(self, smiles_words: List[List[int]]) -> np.array:
        """Compute frequency-based bag-of-words vectors of the ligands.

        Parameters
        ----------
        smiles_words : List[List[int]]
            Ligand words of each ligand as a sequence of sequences.

        Returns
        -------
        np.array
            Bag-of-words vectors stacked in a matrix.
        """
        vectorizer = self.ligand_bow_vectorizer
        return vectorizer.texts_to_matrix(smiles_words, mode="freq")

    def vectorize_proteins(self, protein_words: List[List[int]]) -> np.array:
        """Compute frequency-based bag-of-words vectors of the proteins.

        Parameters
        ----------
        protein_words : List[List[int]]
            Protein words of each protein as a sequence of sequences.

        Returns
        -------
        np.array
            Bag-of-words vectors stacked in a matrix.
        """
        vectorizer = self.protein_bow_vectorizer
        return vectorizer.texts_to_matrix(protein_words, mode="freq")

    def train(
        self,
        train_ligands: List[str],
        train_proteins: List[str],
        train_labels: List[float],
    ):
        """Train a BoWDTA model on the provided protein-ligand interactions.

        The biomolecules are represented as bags of their biomolecule words
        and a decision tree is fitted on the concatenated vectors.

        Parameters
        ----------
        train_ligands : List[str]
            SMILES strings of the training ligands.
        train_proteins : List[str]
            Amino-acid sequences of the training proteins.
        train_labels : List[float]
            Affinity scores of the training interactions.
        """
        ligand_tokens = self.tokenize_ligands(train_ligands)
        protein_tokens = self.tokenize_proteins(train_proteins)

        # The vectorizers must be fitted before they can build the matrices.
        self.ligand_bow_vectorizer.fit_on_texts(ligand_tokens)
        self.protein_bow_vectorizer.fit_on_texts(protein_tokens)

        # One row per interaction: ligand features followed by protein features.
        features = np.hstack(
            [
                self.vectorize_ligands(ligand_tokens),
                self.vectorize_proteins(protein_tokens),
            ]
        )
        self.prediction_model.fit(features, train_labels)

    def predict(
        self, ligands: List[str], proteins: List[str]
    ) -> List[float]:
        """Predict the affinities of a list of protein-ligand pairs.

        Parameters
        ----------
        ligands : List[str]
            SMILES strings of the ligands.
        proteins : List[str]
            Amino-acid sequences of the proteins.

        Returns
        -------
        List[float]
            Predicted affinities.
        """
        ligand_tokens = self.tokenize_ligands(ligands)
        protein_tokens = self.tokenize_proteins(proteins)

        # Same column layout as in training: ligand features, then protein features.
        features = np.hstack(
            [
                self.vectorize_ligands(ligand_tokens),
                self.vectorize_proteins(protein_tokens),
            ]
        )
        predictions = self.prediction_model.predict(features)
        return predictions.tolist()

`__init__()`

Constructor to create a BoWDTA model. BoWDTA represents the proteins and ligands as "bag-of-words" and uses a decision tree for prediction. BoWDTA uses the same biomolecule vocabulary as BPEDTA.

Source code in pydebiaseddta/guides/bowdta.py

def __init__(self):
    """Constructor to create a BoWDTA model.

    BoWDTA represents the proteins and ligands as "bag-of-words" and uses
    a decision tree for prediction. BoWDTA uses the same biomolecule
    vocabulary as BPEDTA.
    """
    # Both vectorizers share every option except the out-of-vocabulary token.
    tokenizer_options = {"filters": None, "lower": False}
    self.ligand_bow_vectorizer = Tokenizer(oov_token="C", **tokenizer_options)
    self.protein_bow_vectorizer = Tokenizer(oov_token="$", **tokenizer_options)
    self.prediction_model = DecisionTreeRegressor()

`predict(ligands, proteins)`

Predicts the affinities of a list of protein-ligand pairs.

Parameters:

Name	Type	Description	Default
`ligands`	`List[str]`	SMILES strings of the ligands.	required
`proteins`	`List[str]`	Amino-acid sequences of the proteins.	required

Returns:

Type	Description
`List[float]`	Predicted affinities.

Source code in pydebiaseddta/guides/bowdta.py

def predict(
    self, ligands: List[str], proteins: List[str]
) -> List[float]:
    """Predict the affinities of a list of protein-ligand pairs.

    Parameters
    ----------
    ligands : List[str]
        SMILES strings of the ligands.
    proteins : List[str]
        Amino-acid sequences of the proteins.

    Returns
    -------
    List[float]
        Predicted affinities.
    """
    ligand_tokens = self.tokenize_ligands(ligands)
    protein_tokens = self.tokenize_proteins(proteins)

    # Same column layout as in training: ligand features, then protein features.
    features = np.hstack(
        [
            self.vectorize_ligands(ligand_tokens),
            self.vectorize_proteins(protein_tokens),
        ]
    )
    predictions = self.prediction_model.predict(features)
    return predictions.tolist()

`tokenize_ligands(smiles)`

Segments SMILES strings of the ligands into their ligand words and applies label encoding.

Parameters:

Name	Type	Description	Default
`smiles`	`List[str]`	The SMILES strings of the ligands	required

Returns:

Type	Description
`List[List[int]]`	Label encoded sequences of ligand words.

Source code in pydebiaseddta/guides/bowdta.py

def tokenize_ligands(self, smiles: List[str]) -> List[List[int]]:
    """Segment the SMILES strings into ligand words and label-encode them.

    Parameters
    ----------
    smiles : List[str]
        The SMILES strings of the ligands.

    Returns
    -------
    List[List[int]]
        Label encoded sequences of ligand words.
    """
    # The SMILES strings are first mapped to their unichar form, which is
    # the input the ligand word identifier is called with.
    unichar_smiles = smiles_to_unichar_batch(
        smiles, load_smiles_to_unichar_encoding()
    )
    ligand_vocabulary = load_ligand_word_identifier(vocab_size=8000)
    # NOTE(review): 100 is presumably the maximum encoded length; confirm
    # against `encode_sequences`.
    return ligand_vocabulary.encode_sequences(unichar_smiles, 100)

`tokenize_proteins(aa_sequences)`

Segments amino-acid sequences of the proteins into their protein words and applies label encoding.

Parameters:

Name	Type	Description	Default
`aa_sequences`	`List[str]`	The amino-acid sequences of the proteins.	required

Returns:

Type	Description
`List[List[int]]`	Label encoded sequences of protein words.

Source code in pydebiaseddta/guides/bowdta.py

def tokenize_proteins(self, aa_sequences: List[str]) -> List[List[int]]:
    """Segment the amino-acid sequences into protein words and label-encode them.

    Parameters
    ----------
    aa_sequences : List[str]
        The amino-acid sequences of the proteins.

    Returns
    -------
    List[List[int]]
        Label encoded sequences of protein words.
    """
    protein_vocabulary = load_protein_word_identifier(vocab_size=32000)
    # NOTE(review): 1000 is presumably the maximum encoded length; confirm
    # against `encode_sequences`.
    encoded_proteins = protein_vocabulary.encode_sequences(aa_sequences, 1000)
    return encoded_proteins

`train(train_ligands, train_proteins, train_labels)`

Trains a BoWDTA model on the provided protein-ligand interactions. The biomolecules are represented as bag of their biomolecule words and a decision tree is used for affinity prediction.

Parameters:

Name	Type	Description	Default
`train_ligands`	`List[str]`	SMILES strings of the training ligands.	required
`train_proteins`	`List[str]`	Amino-acid sequences of the training proteins.	required
`train_labels`	`List[float]`	Affinity scores of the training interactions.	required

Source code in pydebiaseddta/guides/bowdta.py

def train(
    self,
    train_ligands: List[str],
    train_proteins: List[str],
    train_labels: List[float],
):
    """Train a BoWDTA model on the provided protein-ligand interactions.

    The biomolecules are represented as bags of their biomolecule words
    and a decision tree is fitted on the concatenated vectors.

    Parameters
    ----------
    train_ligands : List[str]
        SMILES strings of the training ligands.
    train_proteins : List[str]
        Amino-acid sequences of the training proteins.
    train_labels : List[float]
        Affinity scores of the training interactions.
    """
    ligand_tokens = self.tokenize_ligands(train_ligands)
    protein_tokens = self.tokenize_proteins(train_proteins)

    # The vectorizers must be fitted before they can build the matrices.
    self.ligand_bow_vectorizer.fit_on_texts(ligand_tokens)
    self.protein_bow_vectorizer.fit_on_texts(protein_tokens)

    # One row per interaction: ligand features followed by protein features.
    features = np.hstack(
        [
            self.vectorize_ligands(ligand_tokens),
            self.vectorize_proteins(protein_tokens),
        ]
    )
    self.prediction_model.fit(features, train_labels)

`vectorize_ligands(smiles_words)`

Computes bag-of-words vectors of the ligands based on their frequency.

Parameters:

Name	Type	Description	Default
`smiles_words`	`List[List[int]]`	ligand words of each ligand as a sequence of sequences.	required

Returns:

Type	Description
`np.array`	Bag-of-words vectors stacked in a matrix.

Source code in pydebiaseddta/guides/bowdta.py

def vectorize_ligands(self, smiles_words: List[List[int]]) -> np.array:
    """Compute frequency-based bag-of-words vectors of the ligands.

    Parameters
    ----------
    smiles_words : List[List[int]]
        Ligand words of each ligand as a sequence of sequences.

    Returns
    -------
    np.array
        Bag-of-words vectors stacked in a matrix.
    """
    vectorizer = self.ligand_bow_vectorizer
    return vectorizer.texts_to_matrix(smiles_words, mode="freq")

`vectorize_proteins(protein_words)`

Computes bag-of-words vectors of the proteins based on their frequency.

Parameters:

Name	Type	Description	Default
`protein_words`	`List[List[int]]`	Protein words of each protein as a sequence of sequences.	required

Returns:

Type	Description
`np.array`	Bag-of-words vectors stacked in a matrix.

Source code in pydebiaseddta/guides/bowdta.py

def vectorize_proteins(self, protein_words: List[List[int]]) -> np.array:
    """Compute frequency-based bag-of-words vectors of the proteins.

    Parameters
    ----------
    protein_words : List[List[int]]
        Protein words of each protein as a sequence of sequences.

    Returns
    -------
    np.array
        Bag-of-words vectors stacked in a matrix.
    """
    vectorizer = self.protein_bow_vectorizer
    return vectorizer.texts_to_matrix(protein_words, mode="freq")

`IDDTA`

Bases: Guide

Source code in pydebiaseddta/guides/iddta.py

class IDDTA(Guide):
    """Guide that one-hot encodes biomolecule identities and fits a decision tree."""

    def __init__(self):
        """Constructor to create an IDDTA model.

        IDDTA represents the proteins and ligands with one-hot vectors of
        their identities and uses a decision tree for prediction.
        """
        self.prediction_model = DecisionTreeRegressor()
        # handle_unknown="ignore" is passed so that identities not seen during
        # fitting do not make `transform` fail at prediction time.
        self.ligand_encoder = OneHotEncoder(handle_unknown="ignore")
        self.protein_encoder = OneHotEncoder(handle_unknown="ignore")

    def vectorize_ligands(self, ligands: List[str]) -> np.ndarray:
        """Creates one-hot vectors of the ligands.

        Parameters
        ----------
        ligands : List[str]
            SMILES strings of the input ligands (other representations are also possible, but SMILES is used in this study).

        Returns
        -------
        np.ndarray
            One-hot encoded vectors of the ligands.
        """
        # The encoder is given a single-column 2D array: one ligand per row.
        ligands = np.array(ligands).reshape(-1, 1)
        # `.toarray()` yields a plain ndarray; `.todense()` would yield the
        # legacy `np.matrix` type.
        return self.ligand_encoder.transform(ligands).toarray()

    def vectorize_proteins(self, proteins: List[str]) -> np.ndarray:
        """Creates one-hot vectors of the proteins.

        Parameters
        ----------
        proteins : List[str]
            Amino-acid sequences of the input proteins.

        Returns
        -------
        np.ndarray
            One-hot encoded vectors of the proteins.
        """
        # The encoder is given a single-column 2D array: one protein per row.
        proteins = np.array(proteins).reshape(-1, 1)
        # `.toarray()` yields a plain ndarray; `.todense()` would yield the
        # legacy `np.matrix` type.
        return self.protein_encoder.transform(proteins).toarray()

    def train(
        self,
        train_ligands: List[str],
        train_proteins: List[str],
        train_labels: List[float],
    ):
        """Trains the IDDTA model. IDDTA represents the biomolecules with
        one-hot-encoding of their identities and applies decision tree for affinity prediction.

        Parameters
        ----------
        train_ligands : List[str]
            SMILES strings of the training ligands.
        train_proteins : List[str]
            Amino-acid sequences of the training proteins.
        train_labels : List[float]
            Affinity scores of the interactions.
        """
        # `_list_to_numpy` is defined elsewhere in the module; presumably it
        # reshapes the list into the column layout the encoder expects.
        # `.toarray()` keeps the features as plain ndarrays instead of `np.matrix`.
        ligand_vecs = self.ligand_encoder.fit_transform(
            _list_to_numpy(train_ligands)
        ).toarray()
        protein_vecs = self.protein_encoder.fit_transform(
            _list_to_numpy(train_proteins)
        ).toarray()

        # One row per interaction: ligand one-hot followed by protein one-hot.
        X_train = np.hstack([ligand_vecs, protein_vecs])
        self.prediction_model.fit(X_train, train_labels)

    def predict(self, ligands: List[str], proteins: List[str]) -> List[float]:
        """Predicts the affinities of a list of protein-ligand pairs.

        Parameters
        ----------
        ligands : List[str]
            SMILES strings of the ligands.
        proteins : List[str]
            Amino-acid sequences of the proteins.

        Returns
        -------
        List[float]
            Predicted affinities, returned exactly as produced by the
            underlying model's ``predict``.
            NOTE(review): unlike ``BoWDTA.predict`` the result is not passed
            through ``.tolist()``; confirm whether callers need a list.
        """
        ligand_vecs = self.vectorize_ligands(ligands)
        protein_vecs = self.vectorize_proteins(proteins)
        # np.asarray guards against an overriding vectorizer handing back
        # `np.matrix`, which `np.hstack` would otherwise propagate to the model.
        X_test = np.asarray(np.hstack([ligand_vecs, protein_vecs]))
        return self.prediction_model.predict(X_test)

`__init__()`

Constructor to create an IDDTA model. IDDTA represents the proteins and ligands with one-hot vectors of their identities and uses a decision tree for prediction.

Source code in pydebiaseddta/guides/iddta.py

def __init__(self):
    """Constructor to create an IDDTA model.

    IDDTA represents the proteins and ligands with one-hot vectors of
    their identities and uses a decision tree for prediction.
    """
    self.prediction_model = DecisionTreeRegressor()
    # Ligand and protein encoders are configured identically.
    encoder_options = {"handle_unknown": "ignore"}
    self.ligand_encoder = OneHotEncoder(**encoder_options)
    self.protein_encoder = OneHotEncoder(**encoder_options)

`predict(ligands, proteins)`

Predicts the affinities of a list of protein-ligand pairs.

Parameters:

Name	Type	Description	Default
`ligands`	`List[str]`	SMILES strings of the ligands.	required
`proteins`	`List[str]`	Amino-acid sequences of the proteins.	required

Returns:

Type	Description
`List[float]`	Predicted affinities.

Source code in pydebiaseddta/guides/iddta.py

def predict(self, ligands: List[str], proteins: List[str]) -> List[float]:
    """Predicts the affinities of a list of protein-ligand pairs.

    Parameters
    ----------
    ligands : List[str]
        SMILES strings of the ligands.
    proteins : List[str]
        Amino-acid sequences of the proteins.

    Returns
    -------
    List[float]
        Predicted affinities, returned exactly as produced by the
        underlying model's ``predict``.
        NOTE(review): unlike ``BoWDTA.predict`` the result is not passed
        through ``.tolist()``; confirm whether callers need a list.
    """
    ligand_vecs = self.vectorize_ligands(ligands)
    protein_vecs = self.vectorize_proteins(proteins)
    # The vectorizers may hand back `np.matrix` (the result of `.todense()`),
    # which `np.hstack` propagates; np.asarray converts it to a plain ndarray
    # before it reaches the estimator.
    X_test = np.asarray(np.hstack([ligand_vecs, protein_vecs]))
    return self.prediction_model.predict(X_test)

`train(train_ligands, train_proteins, train_labels)`

Trains the IDDTA model. IDDTA represents the biomolecules with one-hot-encoding of their identities and applies decision tree for affinity prediction.

Parameters:

Name	Type	Description	Default
`train_ligands`	`List[str]`	SMILES strings of the training ligands.	required
`train_proteins`	`List[str]`	Amino-acid sequences of the training proteins.	required
`train_labels`	`List[float]`	Affinity scores of the interactions.	required

Source code in pydebiaseddta/guides/iddta.py

def train(
    self,
    train_ligands: List[str],
    train_proteins: List[str],
    train_labels: List[float],
):
    """Trains the IDDTA model. IDDTA represents the biomolecules with
    one-hot-encoding of their identities and applies decision tree for affinity prediction.

    Parameters
    ----------
    train_ligands : List[str]
        SMILES strings of the training ligands.
    train_proteins : List[str]
        Amino-acid sequences of the training proteins.
    train_labels : List[float]
        Affinity scores of the interactions.
    """
    # `_list_to_numpy` is defined elsewhere in the module; presumably it
    # reshapes the list into the column layout the encoder expects.
    # `.toarray()` keeps the features as plain ndarrays; `.todense()` would
    # produce the legacy `np.matrix` type.
    ligand_vecs = self.ligand_encoder.fit_transform(
        _list_to_numpy(train_ligands)
    ).toarray()
    protein_vecs = self.protein_encoder.fit_transform(
        _list_to_numpy(train_proteins)
    ).toarray()

    # One row per interaction: ligand one-hot followed by protein one-hot.
    X_train = np.hstack([ligand_vecs, protein_vecs])
    self.prediction_model.fit(X_train, train_labels)

`vectorize_ligands(ligands)`

Creates one-hot vectors of the ligands.

Parameters:

Name	Type	Description	Default
`ligands`	`List[str]`	SMILES strings of the input ligands (other representations are also possible, but SMILES is used in this study).	required

Returns:

Type	Description
`np.array`	One-hot encoded vectors of the ligands.

Source code in pydebiaseddta/guides/iddta.py

def vectorize_ligands(self, ligands: List[str]) -> np.ndarray:
    """Creates one-hot vectors of the ligands.

    Parameters
    ----------
    ligands : List[str]
        SMILES strings of the input ligands (other representations are also possible, but SMILES is used in this study).

    Returns
    -------
    np.ndarray
        One-hot encoded vectors of the ligands.
    """
    # The encoder is given a single-column 2D array: one ligand per row.
    ligands = np.array(ligands).reshape(-1, 1)
    # `.toarray()` yields a plain ndarray; `.todense()` would yield the
    # legacy `np.matrix` type.
    return self.ligand_encoder.transform(ligands).toarray()

`vectorize_proteins(proteins)`

Creates one-hot vectors of the proteins.

Parameters:

Name	Type	Description	Default
`proteins`	`List[str]`	Amino-acid sequences of the input proteins.	required

Returns:

Type	Description
`np.array`	One-hot encoded vectors of the proteins.

Source code in pydebiaseddta/guides/iddta.py

def vectorize_proteins(self, proteins: List[str]) -> np.ndarray:
    """Creates one-hot vectors of the proteins.

    Parameters
    ----------
    proteins : List[str]
        Amino-acid sequences of the input proteins.

    Returns
    -------
    np.ndarray
        One-hot encoded vectors of the proteins.
    """
    # The encoder is given a single-column 2D array: one protein per row.
    proteins = np.array(proteins).reshape(-1, 1)
    # `.toarray()` yields a plain ndarray; `.todense()` would yield the
    # legacy `np.matrix` type.
    return self.protein_encoder.transform(proteins).toarray()

guides

Guide

predict(ligands, proteins) abstractmethod

train(train_ligands, train_proteins, train_labels) abstractmethod

BoWDTA

__init__()

predict(ligands, proteins)

tokenize_ligands(smiles)

tokenize_proteins(aa_sequences)

train(train_ligands, train_proteins, train_labels)

vectorize_ligands(smiles_words)

vectorize_proteins(protein_words)

IDDTA

__init__()

predict(ligands, proteins)

train(train_ligands, train_proteins, train_labels)

vectorize_ligands(ligands)

vectorize_proteins(proteins)

`Guide`

`predict(ligands, proteins)` `abstractmethod`

`train(train_ligands, train_proteins, train_labels)` `abstractmethod`

`BoWDTA`

`__init__()`

`predict(ligands, proteins)`

`tokenize_ligands(smiles)`

`tokenize_proteins(aa_sequences)`

`train(train_ligands, train_proteins, train_labels)`

`vectorize_ligands(smiles_words)`

`vectorize_proteins(protein_words)`

`IDDTA`

`__init__()`

`predict(ligands, proteins)`

`train(train_ligands, train_proteins, train_labels)`

`vectorize_ligands(ligands)`

`vectorize_proteins(proteins)`