
Python API

bioclip.TreeOfLifeClassifier(**kwargs)

Bases: BaseClassifier

A classifier for predicting taxonomic ranks for images.

See BaseClassifier for details on **kwargs.

Source code in src/bioclip/predict.py
def __init__(self, **kwargs):
    """
    See `BaseClassifier` for details on `**kwargs`.
    """
    super().__init__(**kwargs)
    self.txt_embeddings = self.get_txt_emb().to(self.device)
    self.txt_names = self.get_txt_names()
    self._subset_txt_embeddings = None
    self._subset_txt_names = None
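
A minimal construction sketch; the keyword arguments shown are those forwarded to BaseClassifier, and "cuda" is only an assumption about available hardware:

from bioclip import TreeOfLifeClassifier

# Build the classifier with the default BioCLIP model on CPU; pass device="cuda"
# (or a torch.device) if a GPU is available.
classifier = TreeOfLifeClassifier(device="cpu")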

predict(images, rank, min_prob=1e-09, k=5, batch_size=10)

Predicts probabilities at the supplied taxonomic rank for the given images using the Tree of Life embeddings.

Parameters:

  images (List[str] | str | List[Image], required):
      A list of image file paths, a single image file path, or a list of PIL Image objects.
  rank (Rank, required):
      The rank at which to make predictions (e.g., species, genus).
  min_prob (float, default 1e-09):
      The minimum probability threshold for predictions.
  k (int, default 5):
      The number of top predictions to return.
  batch_size (int, default 10):
      The number of images to process in a batch.

Returns:

  List[dict]: A list of dicts with keys "file_name", taxon ranks, "common_name", and "score".

Source code in src/bioclip/predict.py
@torch.no_grad()
def predict(self, images: List[str] | str | List[PIL.Image.Image], rank: Rank, 
            min_prob: float = 1e-9, k: int = 5, batch_size: int = 10) -> dict[str, dict[str, float]]:
    """
    Predicts probabilities for supplied taxa rank for given images using the Tree of Life embeddings.

    Parameters:
        images (List[str] | str | List[PIL.Image.Image]): A list of image file paths, a single image file path, or a list of PIL Image objects.
        rank (Rank): The rank at which to make predictions (e.g., species, genus).
        min_prob (float, optional): The minimum probability threshold for predictions.
        k (int, optional): The number of top predictions to return.
        batch_size (int, optional): The number of images to process in a batch.

    Returns:
        List[dict]: A list of dicts with keys "file_name", taxon ranks, "common_name", and "score".
    """

    if isinstance(images, str):
        images = [images]
    probs = self.create_batched_probabilities_for_images(images, self.get_txt_embeddings(),
                                                         batch_size=batch_size)
    result = []
    for i, image in enumerate(images):
        key = self.make_key(image, i)
        image_probs = probs[key].cpu()
        if rank == Rank.SPECIES:
            result.extend(self.format_species_probs(key, image_probs, k))
        else:
            result.extend(self.format_grouped_probs(key, image_probs, rank, min_prob, k))
    self.record_event(images=images, rank=rank.get_label(), min_prob=min_prob, k=k, batch_size=batch_size)
    return result
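
A minimal usage sketch; the image path is hypothetical:

from bioclip import TreeOfLifeClassifier, Rank

classifier = TreeOfLifeClassifier()
# Top 3 species predictions for a single image file.
predictions = classifier.predict("example_images/frog.jpg", rank=Rank.SPECIES, k=3)
for prediction in predictions:
    # Each dict also carries the taxon rank keys described above (e.g. "species").
    print(prediction["common_name"], prediction["score"])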

get_label_data()

Retrieves label data for the tree of life embeddings as a pandas DataFrame.

Returns:

  pd.DataFrame: A DataFrame containing label data for TOL embeddings.

Source code in src/bioclip/predict.py
def get_label_data(self) -> pd.DataFrame:
    """
    Retrieves label data for the tree of life embeddings as a pandas DataFrame.

    Returns:
        pd.DataFrame: A DataFrame containing label data for TOL embeddings.
    """

    data = []
    for name_ary in self.txt_names:
        data.append(create_classification_dict(names=name_ary, rank=Rank.SPECIES))
    return pd.DataFrame(data, copy=True)
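
A short sketch for inspecting the label data:

from bioclip import TreeOfLifeClassifier

classifier = TreeOfLifeClassifier()
label_data = classifier.get_label_data()
print(label_data.columns.tolist())   # one column per taxonomic rank label
print(len(label_data))               # one row per Tree of Life text embedding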

create_taxa_filter(rank, user_values)

Creates a filter for taxa based on the specified rank and user-provided values.

Parameters:

  rank (Rank, required):
      The taxonomic rank to filter by.
  user_values (List[str], required):
      A list of user-provided values to filter the taxa.

Returns:

  List[bool]: A list of boolean values indicating whether each entry in the label data matches any of the user-provided values.

Raises:

  ValueError: If any of the user-provided values are not found in the label data for the specified taxonomic rank.

Source code in src/bioclip/predict.py
def create_taxa_filter(self, rank: Rank, user_values: List[str]) -> List[bool]:
    """
    Creates a filter for taxa based on the specified rank and user-provided values.

    Args:
        rank (Rank): The taxonomic rank to filter by.
        user_values (List[str]): A list of user-provided values to filter the taxa.

    Returns:
        List[bool]: A list of boolean values indicating whether each entry in the 
                    label data matches any of the user-provided values.

    Raises:
        ValueError: If any of the user-provided values are not found in the label data 
                    for the specified taxonomic rank.
    """

    taxa_column = rank.get_label()
    label_data = self.get_label_data()

    # Ensure all user values exist
    pd_user_values = pd.Series(user_values, name=taxa_column)
    unknown_values = pd_user_values[~pd_user_values.isin(label_data[taxa_column])]
    if not unknown_values.empty:
        bad_species = ", ".join(unknown_values.values)
        raise ValueError(f"Unknown {taxa_column} received: {bad_species}. Only known {taxa_column} may be used.")

    return label_data[taxa_column].isin(pd_user_values)
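
A sketch of building a filter; the family names are example values and must already exist in the label data, otherwise a ValueError is raised:

from bioclip import TreeOfLifeClassifier, Rank

classifier = TreeOfLifeClassifier()
taxa_filter = classifier.create_taxa_filter(Rank.FAMILY, ["Ranidae", "Bufonidae"])
print(sum(taxa_filter))   # number of Tree of Life entries that would be kept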

apply_filter(keep_labels_ary)

Filters the TOL embeddings based on the provided boolean array. See create_taxa_filter() for an easy way to create this parameter.

Parameters:

  keep_labels_ary (List[bool], required):
      A list of boolean values indicating which TOL embeddings to keep.

Raises:

  ValueError: If the length of keep_labels_ary does not match the expected length.

Source code in src/bioclip/predict.py
def apply_filter(self, keep_labels_ary: List[bool]):
    """
    Filters the TOL embeddings based on the provided boolean array. See `create_taxa_filter()` for an easy way to create this parameter.

    Args:
        keep_labels_ary (List[bool]): A list of boolean values indicating which 
                                      TOL embeddings to keep.

    Raises:
        ValueError: If the length of keep_labels_ary does not match the expected length.
    """

    if len(keep_labels_ary) != len(self.txt_names):
        expected = len(self.txt_names)
        raise ValueError("Invalid keep_embeddings values. " + 
                         f"This parameter should be a list containing {expected} items.")
    embeddings = []
    names = []
    for idx, keep in enumerate(keep_labels_ary):
        if keep:
            embeddings.append(self.txt_embeddings[:,idx])
            names.append(self.txt_names[idx])
    self._subset_txt_embeddings = torch.stack(embeddings, dim=1)
    self._subset_txt_names = names
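
A sketch combining create_taxa_filter() and apply_filter(); the class name and image path are example values:

from bioclip import TreeOfLifeClassifier, Rank

classifier = TreeOfLifeClassifier()
# Keep only embeddings for the class Aves, then predict within that subset.
classifier.apply_filter(classifier.create_taxa_filter(Rank.CLASS, ["Aves"]))
predictions = classifier.predict("example_images/bird.jpg", rank=Rank.SPECIES, k=3)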

bioclip.Rank

Rank for the Tree of Life classification.

  • KINGDOM
  • PHYLUM
  • CLASS
  • ORDER
  • FAMILY
  • GENUS
  • SPECIES
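
A small sketch; Rank members are passed as the rank argument to TreeOfLifeClassifier.predict():

from bioclip import Rank

print(list(Rank))   # all available taxonomic ranks, from KINGDOM down to SPECIES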

bioclip.CustomLabelsClassifier(cls_ary, **kwargs)

Bases: BaseClassifier

A classifier that predicts from a list of custom labels for images.

Initializes the classifier with the given class array and additional keyword arguments.

Parameters:

  cls_ary (List[str], required):
      A list of class names as strings.

Source code in src/bioclip/predict.py
def __init__(self, cls_ary: List[str], **kwargs):
    """
    Initializes the classifier with the given class array and additional keyword arguments.

    Parameters:
        cls_ary (List[str]): A list of class names as strings.
    """
    super().__init__(**kwargs)
    self.tokenizer = create_bioclip_tokenizer(self.model_str)
    self.classes = [cls.strip() for cls in cls_ary]
    self.txt_embeddings = self._get_txt_embeddings(self.classes)
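
A minimal construction sketch; the labels are arbitrary example strings:

from bioclip import CustomLabelsClassifier

classifier = CustomLabelsClassifier(["insect", "spider", "bird"])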

predict(images, k=None, batch_size=10)

Predicts the probabilities for the given images.

Parameters:

  images (List[str] | str | List[Image], required):
      A list of image file paths, a single image file path, or a list of PIL Image objects.
  k (int, default None):
      The number of top probabilities to return. If not specified or if greater than the number of classes, all probabilities are returned.
  batch_size (int, default 10):
      The number of images to process in a batch.

Returns:

  List[dict]: A list of dicts with keys "file_name" and the custom class labels.

Source code in src/bioclip/predict.py
@torch.no_grad()
def predict(self, images: List[str] | str | List[PIL.Image.Image], k: int = None,
            batch_size: int = 10) -> dict[str, float]:
    """
    Predicts the probabilities for the given images.

    Parameters:
        images (List[str] | str | List[PIL.Image.Image]): A list of image file paths, a single image file path, or a list of PIL Image objects.
        k (int, optional): The number of top probabilities to return. If not specified or if greater than the number of classes, all probabilities are returned.
        batch_size (int, optional): The number of images to process in a batch.

    Returns:
        List[dict]: A list of dicts with keys "file_name" and the custom class labels.
    """
    if isinstance(images, str):
        images = [images]
    probs = self.create_batched_probabilities_for_images(images, self.txt_embeddings,
                                                         batch_size=batch_size)
    result = []
    for i, image in enumerate(images):
        key = self.make_key(image, i)
        img_probs = probs[key]
        if not k or k > len(self.classes):
            k = len(self.classes)
        result.extend(self.group_probs(key, img_probs, k))

    self.record_event(images=images, k=k, batch_size=batch_size)
    return result
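
A minimal usage sketch; the labels and image path are example values:

from bioclip import CustomLabelsClassifier

classifier = CustomLabelsClassifier(["insect", "spider", "bird"])
predictions = classifier.predict("example_images/bug.jpg", k=2)
for prediction in predictions:
    print(prediction)   # dict with "file_name" plus the custom label fields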

bioclip.CustomLabelsBinningClassifier(cls_to_bin, **kwargs)

Bases: CustomLabelsClassifier

A classifier that creates predictions for images based on custom labels, groups the labels, and calculates probabilities for each group.

Initializes the class with a dictionary mapping class labels to bin values (group names).

Parameters:

  cls_to_bin (dict, required):
      A dictionary where keys are class labels and values are the bins to group them into.
  **kwargs:
      Additional keyword arguments passed to the superclass initializer.

Raises:

  ValueError: If any value in cls_to_bin is empty, null, or NaN.

Source code in src/bioclip/predict.py
def __init__(self, cls_to_bin: dict, **kwargs):
    """
    Initializes the class with a dictionary mapping class labels to binary values.

    Args:
        cls_to_bin (dict): A dictionary where keys are class labels and values are binary values.
        **kwargs: Additional keyword arguments passed to the superclass initializer.

    Raises:
        ValueError: If any value in `cls_to_bin` is empty, null, or NaN.
    """
    super().__init__(cls_ary=cls_to_bin.keys(), **kwargs)
    self.cls_to_bin = cls_to_bin
    if any([pd.isna(x) or not x for x in cls_to_bin.values()]):
        raise ValueError("Empty, null, or nan are not allowed for bin values.")

bioclip.predict.BaseClassifier(model_str=BIOCLIP_MODEL_STR, pretrained_str=None, device='cpu')

Bases: Module

Initializes the prediction model.

Parameters:

  model_str (str, default BIOCLIP_MODEL_STR):
      The string identifier for the model to be used.
  pretrained_str (str, optional, default None):
      The string identifier for the pretrained model to be loaded.
  device (Union[str, device], default 'cpu'):
      The device on which the model will be run.

Source code in src/bioclip/predict.py
def __init__(self, model_str: str = BIOCLIP_MODEL_STR, pretrained_str: str | None = None, device: Union[str, torch.device] = 'cpu'):
    """
    Initializes the prediction model.

    Parameters:
        model_str (str): The string identifier for the model to be used (defaults to BIOCLIP_MODEL_STR).
        pretrained_str (str, optional): The string identifier for the pretrained model to be loaded.
        device (Union[str, torch.device]): The device on which the model will be run.
    """
    super().__init__()
    self.device = device
    self.load_pretrained_model(model_str=model_str, pretrained_str=pretrained_str)
    self.recorder = None
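
A sketch of passing the BaseClassifier keyword arguments through a subclass; "cuda" assumes a GPU is available:

from bioclip import TreeOfLifeClassifier

# model_str, pretrained_str, and device are accepted by every BaseClassifier subclass.
classifier = TreeOfLifeClassifier(device="cuda")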

forward(x)

Given an input tensor representing multiple images, return probabilities for each class for each image.

Parameters:

  x (torch.Tensor, required):
      Input tensor representing the multiple images.

Returns:

  torch.Tensor: Softmax probabilities of the logits for each class for each image.

Source code in src/bioclip/predict.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
    """
    Given an input tensor representing multiple images, return probabilities for each class for each image.
    Args:
        x (torch.Tensor): Input tensor representing the multiple images.
    Returns:
        torch.Tensor: Softmax probabilities of the logits for each class for each image.
    """
    img_features = self.model.encode_image(x)
    img_features = F.normalize(img_features, dim=-1)
    return self.create_probabilities(img_features, self.txt_embeddings)
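
A sketch of calling the classifier as a torch.nn.Module; the 224x224 input size is an assumption about the model's preprocessing, and the random tensor only illustrates shapes:

import torch
from bioclip import TreeOfLifeClassifier

classifier = TreeOfLifeClassifier()
batch = torch.randn(2, 3, 224, 224).to(classifier.device)   # two fake preprocessed images
with torch.no_grad():
    probs = classifier(batch)   # softmax probabilities, one row per image
print(probs.shape)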

get_cached_datafile(filename)

Downloads a datafile from the Hugging Face hub and caches it locally.

Parameters:

  filename (str, required):
      The name of the file to download from the datafile repository.

Returns:

  str: The local path to the downloaded file.

Source code in src/bioclip/predict.py
def get_cached_datafile(self, filename: str) -> str:
    """
    Downloads a datafile from the Hugging Face hub and caches it locally.
    Args:
        filename (str): The name of the file to download from the datafile repository.
    Returns:
        str: The local path to the downloaded file.
    """
    return hf_hub_download(repo_id=self.get_tol_repo_id(), filename=filename, repo_type=HF_DATAFILE_REPO_TYPE)

get_tol_repo_id()

Returns the repository ID for the TreeOfLife datafile based on the model string.

Returns:

  str: The Hugging Face repository ID for the TreeOfLife embeddings.

Raises:

  ValueError: If the model string is not supported.

Source code in src/bioclip/predict.py
def get_tol_repo_id(self) -> str:
    """
    Returns the repository ID for the TreeOfLife datafile based on the model string.
    Raises:
        ValueError: If the model string is not supported.
    Returns:
        str: The Hugging Face repository ID for the TreeOfLife embeddings.
    """
    return get_tol_repo_id(self.model_str)

get_txt_emb()

Retrieves TreeOfLife text embeddings for the current model from the associated Hugging Face dataset repo.

Returns:

  torch.Tensor: A tensor containing the text embeddings for the tree of life.

Source code in src/bioclip/predict.py
def get_txt_emb(self) -> torch.Tensor:
    """
    Retrieves TreeOfLife text embeddings for the current model from the associated Hugging Face dataset repo.
    Returns:
        torch.Tensor: A tensor containing the text embeddings for the tree of life.
    """
    txt_emb_npy = self.get_cached_datafile("embeddings/txt_emb_species.npy")
    return torch.from_numpy(np.load(txt_emb_npy))

get_txt_names()

Retrieves TreeOfLife text names for the current model from the associated Hugging Face dataset repo.

Returns:

  List[List[str]]: A list of lists, where each inner list contains names corresponding to the text embeddings.

Source code in src/bioclip/predict.py
def get_txt_names(self) -> List[List[str]]:
    """
    Retrieves TreeOfLife text names for the current model from the  associated Hugging Face dataset repo.
    Returns:
        List[List[str]]: A list of lists, where each inner list contains names corresponding to the text embeddings.
    """
    txt_names_json = self.get_cached_datafile("embeddings/txt_emb_species.json")
    with open(txt_names_json) as fd:
        txt_names = json.load(fd)
    return txt_names
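
A short sketch; as apply_filter() above shows, each embedding column lines up with one entry in the names list:

from bioclip import TreeOfLifeClassifier

classifier = TreeOfLifeClassifier()
txt_emb = classifier.get_txt_emb()        # torch.Tensor of Tree of Life text embeddings
txt_names = classifier.get_txt_names()    # list of name lists, one per embedding column
print(txt_emb.shape, len(txt_names))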

bioclip.recorder

Records predictions made by a classifier and saves the output to a file.

attach_prediction_recorder(classifier, **top_level_settings)

Attach a PredictionRecorder to the classifier instance that will record metadata and subsequent predictions. Call save_recorded_predictions to save the recorded predictions to a file.

Parameters:

  classifier (object, required):
      The classifier (such as TreeOfLifeClassifier) instance to attach the recorder to.
  **top_level_settings:
      Additional settings to be recorded.

Returns:

  PredictionRecorder: An instance of PredictionRecorder attached to the classifier.

Source code in src/bioclip/recorder.py
def attach_prediction_recorder(classifier: object, **top_level_settings):
    """
    Attach a PredictionRecorder to the classifier instance that will record metadata and subsequent predictions.
    Call save_recorded_predictions to save the recorded predictions to a file.

    Args:
        classifier (object): The classifier (such as TreeOfLifeClassifier) instance to attach the recorder to.
        **top_level_settings: Additional settings to be recorded.

    Returns:
        PredictionRecorder: An instance of PredictionRecorder attached to the classifier.
    """
    recorder = PredictionRecorder(classifier, **top_level_settings)
    classifier.set_recorder(recorder)
    return recorder
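
A minimal sketch; the top-level setting name and image path are hypothetical:

from bioclip import TreeOfLifeClassifier, Rank
from bioclip.recorder import attach_prediction_recorder

classifier = TreeOfLifeClassifier()
attach_prediction_recorder(classifier, notes="frog survey")
classifier.predict("example_images/frog.jpg", rank=Rank.SPECIES)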

save_recorded_predictions(classifier, path, include_command_line=True)

Saves recorded predictions from the classifier to a file. Before calling this function, ensure that the classifier has a recorder attached using attach_prediction_recorder. The recorder's data is written to the specified file path in either JSON or plain-text format: if the file extension is '.json', the data is serialized as JSON; otherwise, the data is appended in a human-readable text format.

Parameters:

  classifier (object, required):
      The classifier instance (such as TreeOfLifeClassifier) with recorded predictions.
  path (str, required):
      The file path where the report will be saved.
  include_command_line (bool, default True):
      When True, includes the Python command line in the log file.

Raises:

  ValueError: If the output path extension is .json and the file already exists.

Source code in src/bioclip/recorder.py
def save_recorded_predictions(classifier: object, path: str, include_command_line: bool = True):
    """
    Saves recorded predictions from the classifier to a file.
    Before calling this function, ensure that the classifier has a recorder attached
    using attach_prediction_recorder. Saves the recorder's data to the specified file path in 
    either JSON or plain text format. If the file extension is '.json', the data is serialized
    as JSON. Otherwise, the data is appended in a human-readable text format.

    Args:
        classifier (object): The classifier instance (such as TreeOfLifeClassifier) with recorded predictions.
        path (str): The file path where the report will be saved.
        include_command_line (bool): When True includes the python command line in the log file.

    Raises:
        ValueError: If the output path extension is .json and the file already exists.
    """
    if classifier.recorder:
        command_line = " ".join(sys.argv) if include_command_line else None
        classifier.recorder.create_report(path, command_line=command_line)
    else:
        raise ValueError("The classifier does not have a recorder attached.")