sequence

The submodule for processing biomolecule strings. smiles_processing.py contains utility functions to segment SMILES strings into tokens, while word_identification.py contains a class to learn biomolecule words and segment biomolecule sequences into those words.

segment_smiles(smiles, segment_sq_brackets=True)

Segments a SMILES string into its tokens.

Parameters:

Name Type Description Default
smiles str

Input SMILES string.

required
segment_sq_brackets bool, optional

Whether to also segment expressions within square brackets (e.g. [C@@H], [Rb]). Set to True to have square brackets and the tokens inside as standalone tokens, e.g. ["[", "C", "@", "@", "H", "]"]. When set to False, the whole expression is returned as a single token, e.g. "[C@@H]". Defaults to True.

True

Returns:

Type Description
List[str]

The tokens of the SMILES string as a list.

Source code in pydebiaseddta/sequence/smiles_processing.py
def segment_smiles(smiles: str, segment_sq_brackets: bool = True) -> List[str]:
    """Segments a SMILES string into its tokens.

    Parameters
    ----------
    smiles : str
        Input SMILES string.
    segment_sq_brackets : bool, optional
        Whether to also segment expressions within square brackets (*e.g.* [C@@H], [Rb]).
        Set to `True` to have square brackets and the tokens inside as standalone tokens,
        *e.g.* ["[", "C", "@", "@", "H", "]"].
        When set to `False`, the whole expression is returned as a single token, *e.g.* "[C@@H]".
        Defaults to `True`.

    Returns
    -------
    List[str]
        The tokens of the SMILES string as a list.
    """
    regex = _RE_PATTERNS["segmentation_sq"]
    if not segment_sq_brackets:
        regex = _RE_PATTERNS["segmentation"]
    return regex.findall(smiles)
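
A minimal usage sketch, assuming the function is importable from pydebiaseddta.sequence.smiles_processing as the source path above suggests; the outputs follow the behavior described in the docstring:

from pydebiaseddta.sequence.smiles_processing import segment_smiles

# Square-bracket expressions are split into standalone tokens by default.
segment_smiles("C[C@@H]N")
# ['C', '[', 'C', '@', '@', 'H', ']', 'N']

# With segment_sq_brackets=False, the bracketed expression stays whole.
segment_smiles("C[C@@H]N", segment_sq_brackets=False)
# ['C', '[C@@H]', 'N']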

segment_smiles_batch(smiles_batch, segment_sq_brackets=True)

Segments multiple SMILES strings with a single call by wrapping sequence.smiles_processing.segment_smiles.

Parameters:

Name Type Description Default
smiles_batch List[str]

List of input SMILES strings.

required
segment_sq_brackets bool, optional

Whether to segment expressions within square brackets. See sequence.smiles_processing.segment_smiles for a more detailed explanation. Defaults to True.

True

Returns:

Type Description
List[List[str]]

A 2D list of strings where element \([i][j]\) corresponds to the \(j^{th}\) token of the \(i^{th}\) input.

Source code in pydebiaseddta/sequence/smiles_processing.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def segment_smiles_batch(
    smiles_batch: List[str], segment_sq_brackets: bool = True
) -> List[List[str]]:
    """Segments multiple SMILES strings with a single call by wrapping `sequence.smiles_processing.segment_smiles`.

    Parameters
    ----------
    smiles_batch : List[str]
        List of input SMILES strings.
    segment_sq_brackets : bool, optional
        Whether to segment expressions within square brackets. 
        See `sequence.smiles_processing.segment_smiles` for a more detailed explanation.
        Defaults to `True`.

    Returns
    -------
    List[List[str]]
        A 2D list of strings where element $[i][j]$ corresponds to the $j^{th}$ token of the $i^{th}$ input.
    """
    return [segment_smiles(smiles, segment_sq_brackets) for smiles in smiles_batch]
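
A short sketch of batch segmentation, under the same import assumption as above:

from pydebiaseddta.sequence.smiles_processing import segment_smiles_batch

# Element [i][j] is the j-th token of the i-th input SMILES string.
segment_smiles_batch(["CCO", "C=O"])
# [['C', 'C', 'O'], ['C', '=', 'O']]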

WordIdentifier

A versatile class to identify biomolecule words in biomolecule strings. WordIdentifier leverages the Byte Pair Encoding algorithm implemented in the tokenizers library to learn biomolecule vocabularies and segment biomolecule strings into their words.

Source code in pydebiaseddta/sequence/word_identification.py
class WordIdentifier:
    """A versatile class to identify biomolecule words in biomolecule strings. 
    `WordIdentifier` leverages the Byte Pair Encoding algorithm implemented in the `tokenizers` library
    to learn biomolecule vocabularies and segment biomolecule strings into their words. 
    """    
    def __init__(self, vocab_size: int):
        """Creates a `WordIdentifier` instance.

        Parameters
        ----------
        vocab_size : int
            Size of the biomolecule vocabulary.
        """        
        self.vocab_size = vocab_size
        self.tokenizer = Tokenizer(BPE())
        self.tokenizer.pre_tokenizer = Whitespace()

    @classmethod
    def from_file(cls, loadpath: str):
        """Loads a `WordIdentifier` from a file.

        Parameters
        ----------
        loadpath : str
            Path to the `WordIdentifier` file.

        Returns
        -------
        WordIdentifier
            The previously saved `WordIdentifier` instance.
        """        
        if not loadpath.endswith(FILE_EXTENSION):
            loadpath = loadpath + FILE_EXTENSION

        dct = load_json(loadpath)
        vocab_size = len(dct["model"]["vocab"])
        instance = cls(vocab_size)
        instance.tokenizer = Tokenizer.from_str(json.dumps(dct))
        return instance

    def train(self, corpus_path: str):
        """Learns a biomolecule vocabulary from a file of biomolecule strings using Byte Pair Encoding Algorithm. 

        Parameters
        ----------
        corpus_path : str
            Path to the corpus of biomolecule strings. The corpus file must contain a biomolecule string per line.
        """        
        trainer = BpeTrainer(
            vocab_size=self.vocab_size, special_tokens=["[PAD]"]
        )
        self.tokenizer.train([corpus_path], trainer)
        if self.tokenizer.get_vocab_size() < self.vocab_size:
            print(
                f"Warning: Training stopped before the desired vocabulary size was reached. Learned vocab size={self.tokenizer.get_vocab_size()}. Desired size={self.vocab_size}"
            )

    def tokenize_sequences(self, sequences: List[str]) -> List[List[str]]:
        """Segments a List of biomolecule strings into biomolecule words via the learned vocabulary.

        Parameters
        ----------
        sequences : List[str]
            The List of biomolecule strings.

        Returns
        -------
        List[List[str]]
            List of biomolecule words of each input string.
        """        
        encodings = self.tokenizer.encode_batch(sequences)
        return [encoding.tokens for encoding in encodings]

    def encode_sequences(self, sequences: List[str], padding_len: int = None) -> List[List[int]]:
        """Segments a List of biomolecule strings into biomolecule words via the learned vocabulary and
        returns the ids of the biomolecule words, which is convenient for label encoding in subsequent steps.
        Padding is also supported to ease the training of deep learning models.

        Parameters
        ----------
        sequences : List[str]
            The List of biomolecule strings.
        padding_len : int, optional
            The desired length of sequences, by default `None`. No padding is applied when set to `None`.

        Returns
        -------
        List[List[int]]
            List of the biomolecule word ids of each input string.
        """        
        encodings = self.tokenizer.encode_batch(sequences)
        if isinstance(padding_len, int):
            for encoding in encodings:
                encoding.pad(
                    padding_len, direction="right", pad_id=0, pad_token="[PAD]"
                )
                encoding.truncate(padding_len)

        return [encoding.ids for encoding in encodings]

    def save(self, savepath: str):
        """Saves a `WordIdentifier` instance to disk.

        Parameters
        ----------
        savepath : str
            The path to dump the instance. File extension is added automatically.
        """        
        if not savepath.endswith(FILE_EXTENSION):
            savepath = savepath + FILE_EXTENSION
        save_json(json.loads(self.tokenizer.to_str()), savepath)
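
A minimal end-to-end sketch of the class (the corpus and save paths are hypothetical; the corpus file must contain one biomolecule string per line, as train requires):

from pydebiaseddta.sequence.word_identification import WordIdentifier

word_identifier = WordIdentifier(vocab_size=8000)
word_identifier.train("smiles_corpus.txt")  # hypothetical corpus, one string per line
words = word_identifier.tokenize_sequences(["CC(=O)Oc1ccccc1C(=O)O"])
ids = word_identifier.encode_sequences(["CC(=O)Oc1ccccc1C(=O)O"], padding_len=32)
word_identifier.save("my_identifier")  # file extension is appended automatically
restored = WordIdentifier.from_file("my_identifier")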

__init__(vocab_size)

Creates a WordIdentifier instance.

Parameters:

Name Type Description Default
vocab_size int

Size of the biomolecule vocabulary.

required
Source code in pydebiaseddta/sequence/word_identification.py
def __init__(self, vocab_size: int):
    """Creates a `WordIdentifier` instance.

    Parameters
    ----------
    vocab_size : int
        Size of the biomolecule vocabulary.
    """        
    self.vocab_size = vocab_size
    self.tokenizer = Tokenizer(BPE())
    self.tokenizer.pre_tokenizer = Whitespace()

encode_sequences(sequences, padding_len=None)

Segments a List of biomolecule strings into biomolecule words via the learned vocabulary and returns the ids of the biomolecule words, which is convenient for label encoding in subsequent steps. Padding is also supported to ease the training of deep learning models.

Parameters:

Name Type Description Default
sequences List[str]

The List of biomolecule strings.

required
padding_len int, optional

The desired length of sequences, by default None. No padding is applied when set to None.

None

Returns:

Type Description
List[List[int]]

List of the biomolecule word ids of each input string.

Source code in pydebiaseddta/sequence/word_identification.py
def encode_sequences(self, sequences: List[str], padding_len: int = None) -> List[List[int]]:
    """Segments a List of biomolecule strings into biomolecule words via the learned vocabulary and
    returns the ids of the biomolecule words, which is convenient for label encoding in subsequent steps.
    Padding is also supported to ease the training of deep learning models.

    Parameters
    ----------
    sequences : List[str]
        The List of biomolecule strings.
    padding_len : int, optional
        The desired length of sequences, by default `None`. No padding is applied when set to `None`.

    Returns
    -------
    List[List[int]]
        List of the biomolecule word ids of each input string.
    """        
    encodings = self.tokenizer.encode_batch(sequences)
    if isinstance(padding_len, int):
        for encoding in encodings:
            encoding.pad(
                padding_len, direction="right", pad_id=0, pad_token="[PAD]"
            )
            encoding.truncate(padding_len)

    return [encoding.ids for encoding in encodings]
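
A short sketch of the padding behavior, assuming a trained identifier; the id values shown are illustrative:

# Shorter sequences are right-padded with the [PAD] id (0);
# longer ones are truncated to exactly padding_len entries.
ids = word_identifier.encode_sequences(["CCO", "CC(=O)O"], padding_len=5)
# e.g. [[12, 7, 0, 0, 0], [12, 45, 33, 9, 0]]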

from_file(loadpath) classmethod

Loads a WordIdentifier from a file.

Parameters:

Name Type Description Default
loadpath str

Path to the WordIdentifier file.

required

Returns:

Type Description
WordIdentifier

The previously saved WordIdentifier instance.

Source code in pydebiaseddta/sequence/word_identification.py
@classmethod
def from_file(cls, loadpath: str):
    """Loads a `WordIdentifier` from a file.

    Parameters
    ----------
    loadpath : str
        Path to the `WordIdentifier` file.

    Returns
    -------
    WordIdentifier
        The previously saved `WordIdentifier` instance.
    """        
    if not loadpath.endswith(FILE_EXTENSION):
        loadpath = loadpath + FILE_EXTENSION

    dct = load_json(loadpath)
    vocab_size = len(dct["model"]["vocab"])
    instance = cls(vocab_size)
    instance.tokenizer = Tokenizer.from_str(json.dumps(dct))
    return instance
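
For example, to restore a previously saved identifier (the path is hypothetical; the file extension is appended automatically when missing):

from pydebiaseddta.sequence.word_identification import WordIdentifier

word_identifier = WordIdentifier.from_file("vocabs/smiles_8000")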

save(savepath)

Saves a WordIdentifier instance to disk.

Parameters:

Name Type Description Default
savepath str

The path to dump the instance. File extension is added automatically.

required
Source code in pydebiaseddta/sequence/word_identification.py
def save(self, savepath: str):
    """Saves a `WordIdentifier` instance to disk.

    Parameters
    ----------
    savepath : str
        The path to dump the instance. File extension is added automatically.
    """        
    if not savepath.endswith(FILE_EXTENSION):
        savepath = savepath + FILE_EXTENSION
    save_json(json.loads(self.tokenizer.to_str()), savepath)
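
A usage sketch (the path is hypothetical):

word_identifier.save("vocabs/smiles_8000")  # file extension appended if missing
# The instance can later be restored with WordIdentifier.from_file.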

tokenize_sequences(sequences)

Segments a List of biomolecule strings into biomolecule words via the learned vocabulary.

Parameters:

Name Type Description Default
sequences List[str]

The List of biomolecule strings.

required

Returns:

Type Description
List[List[str]]

List of biomolecule words of each input string.

Source code in pydebiaseddta/sequence/word_identification.py
def tokenize_sequences(self, sequences: List[str]) -> List[List[str]]:
    """Segments a List of biomolecule strings into biomolecule words via the learned vocabulary.

    Parameters
    ----------
    sequences : List[str]
        The List of biomolecule strings.

    Returns
    -------
    List[List[str]]
        List of biomolecule words of each input string.
    """        
    encodings = self.tokenizer.encode_batch(sequences)
    return [encoding.tokens for encoding in encodings]
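
For illustration, assuming a trained identifier (the exact segmentation depends on the learned vocabulary):

words = word_identifier.tokenize_sequences(["CC(=O)Oc1ccccc1C(=O)O"])
# e.g. [['CC(', '=O)', 'Oc1ccccc1', 'C(=O)O']]  (illustrative segmentation)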

train(corpus_path)

Learns a biomolecule vocabulary from a file of biomolecule strings using the Byte Pair Encoding algorithm.

Parameters:

Name Type Description Default
corpus_path str

Path to the corpus of biomolecule strings. The corpus file must contain a biomolecule string per line.

required
Source code in pydebiaseddta/sequence/word_identification.py
def train(self, corpus_path: str):
    """Learns a biomolecule vocabulary from a file of biomolecule strings using Byte Pair Encoding Algorithm. 

    Parameters
    ----------
    corpus_path : str
        Path to the corpus of biomolecule strings. The corpus file must contain a biomolecule string per line.
    """        
    trainer = BpeTrainer(
        vocab_size=self.vocab_size, special_tokens=["[PAD]"]
    )
    self.tokenizer.train([corpus_path], trainer)
    if self.tokenizer.get_vocab_size() < self.vocab_size:
        print(
            f"Warning: Training stopped before the desired vocabulary size was reached. Learned vocab size={self.tokenizer.get_vocab_size()}. Desired size={self.vocab_size}"
        )
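
A minimal training sketch (the corpus path is hypothetical):

word_identifier = WordIdentifier(vocab_size=8000)
word_identifier.train("data/chembl_smiles.txt")  # one biomolecule string per line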

load_chemical_word_identifier(vocab_size)

A convenience function to load word vocabularies learned for SMILES strings in the study. The possible vocabularies to load are for DeepDTA and BPE-DTA.

Parameters:

Name Type Description Default
vocab_size int

Size of the learned SMILES word vocabulary. The allowed values are 94 and 8000, for DeepDTA and BPE-DTA, respectively.

required

Returns:

Type Description
WordIdentifier

The WordIdentifier instance used by the DTA models.

Raises:

Type Description
ValueError

If a vocabulary size other than 94 or 8000 is passed, a ValueError is raised.

Source code in pydebiaseddta/sequence/word_identification.py
def load_chemical_word_identifier(vocab_size: int) -> WordIdentifier:
    """A convenience function to load word vocabularies learned for SMILES strings in the study.
    The possible vocabularies to load are for DeepDTA and BPE-DTA. 

    Parameters
    ----------
    vocab_size : int
        Size of the learned SMILES word vocabulary. The allowed values are 94 and 8000, for DeepDTA and BPE-DTA, respectively.

    Returns
    -------
    WordIdentifier
        The `WordIdentifier` instance used by the DTA models.

    Raises
    ------
    ValueError
        If a vocabulary size other than 94 or 8000 is passed, a `ValueError` is raised.
    """    
    if vocab_size not in [94, 8000]:
        raise ValueError("Supported vocab sizes are 94 and 8000")

    chemical_vocab_path = f"{package_path}/data/word_identification/chemical"
    vocab_path = f"{chemical_vocab_path}/chembl27_enc_94.json"
    if vocab_size == 8000:
        vocab_path = f"{chemical_vocab_path}/chembl27_enc_bpe_8000.json"

    return WordIdentifier.from_file(vocab_path)
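
For example, to load the SMILES vocabulary used by BPE-DTA:

from pydebiaseddta.sequence.word_identification import load_chemical_word_identifier

word_identifier = load_chemical_word_identifier(vocab_size=8000)
words = word_identifier.tokenize_sequences(["CC(=O)Oc1ccccc1C(=O)O"])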

load_protein_word_identifier(vocab_size)

A convenience function to load word vocabularies learned for amino-acid sequences in the study. The possible vocabularies to load are for DeepDTA and BPE-DTA.

Parameters:

Name Type Description Default
vocab_size int

Size of the learned protein word vocabulary. The allowed values are 26 and 32000, for DeepDTA and BPE-DTA, respectively.

required

Returns:

Type Description
WordIdentifier

The WordIdentifier instance used by the DTA models.

Raises:

Type Description
ValueError

If a vocabulary size other than 26 or 32000 is passed, a ValueError is raised.

Source code in pydebiaseddta/sequence/word_identification.py
def load_protein_word_identifier(vocab_size: int) -> WordIdentifier:
    """A convenience function to load word vocabularies learned for amino-acid sequences in the study.
    The possible vocabularies to load are for DeepDTA and BPE-DTA. 

    Parameters
    ----------
    vocab_size : int
        Size of the learned protein word vocabulary. The allowed values are 26 and 32000, for DeepDTA and BPE-DTA, respectively.

    Returns
    -------
    WordIdentifier
        The `WordIdentifier` instance used by the DTA models.

    Raises
    ------
    ValueError
        If a vocabulary size other than 26 or 32000 is passed, a `ValueError` is raised.
    """    
    if vocab_size not in [26, 32000]:
        raise ValueError("Supported vocab sizes are 26 and 32000")

    protein_vocab_path = f"{package_path}/data/word_identification/protein"
    vocab_path = f"{protein_vocab_path}/uniprot_26.json"
    if vocab_size == 32000:
        vocab_path = f"{protein_vocab_path}/uniprot_bpe_32000.json"

    return WordIdentifier.from_file(vocab_path)
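
And similarly for protein words, e.g. the BPE-DTA vocabulary:

from pydebiaseddta.sequence.word_identification import load_protein_word_identifier

word_identifier = load_protein_word_identifier(vocab_size=32000)
ids = word_identifier.encode_sequences(["MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ"], padding_len=64)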