Source code for wsipipe.datasets.camelyon16

"""
This module creates the dataframe for the Camelyon 16 dataset with the following columns:
    - The slide column stores the paths on disk of the whole slide images.
    - The annotation column records a path to the annotation files.
    - The label column is the slide level label.
    - The tags column is blank for camelyon 16.

This assumes there is a folder on disk structured the same as downloading 
from the camelyon grand challenge Camelyon 16 google drive:
https://camelyon17.grand-challenge.org/Data/ 

"""

from pathlib import Path

import pandas as pd

def training(cam16_path: Path = Path("data", "camelyon16"), project_root: Path = None) -> pd.DataFrame:
    """Create the Camelyon 16 training dataset.

    Lists the ``*.tif`` slides in ``training/tumor`` and ``training/normal``
    and the ``*.xml`` files in ``training/lesion_annotations``, then builds a
    dataframe with one row per slide (tumor slides first, then normal slides,
    each group sorted by path).

    Each tumor slide is paired with the annotation file that has the same
    file stem (e.g. ``tumor_001.tif`` with ``tumor_001.xml``). A tumor slide
    without a matching annotation file, and every normal slide, gets an empty
    string in the annotation column.

    Args:
        cam16_path (Path, optional): location of the Camelyon 16 data. If
            ``project_root`` is given this is taken relative to it.
            Defaults to data/camelyon16.
        project_root (Path, optional): if given, the data is looked up under
            ``project_root / cam16_path`` and every path stored in the
            dataframe is relative to ``project_root``. Defaults to None, in
            which case paths are stored exactly as found under ``cam16_path``.
    Returns:
        df (pd.DataFrame): A dataframe with columns slide, annotation, label and tags.
            label is "tumor" or "normal"; tags is always an empty string.
    """
    # set up the paths to the slides and annotations
    if project_root is None:
        dataset_root = Path(cam16_path) / "training"
    else:
        dataset_root = project_root / Path(cam16_path) / "training"

    def _sorted_files(directory, pattern):
        # all paths are made relative to the project root if it is defined
        found = directory.glob(pattern)
        if project_root is not None:
            found = (p.relative_to(project_root) for p in found)
        return sorted(found)

    annotation_paths = _sorted_files(dataset_root / "lesion_annotations", "*.xml")
    tumor_slide_paths = _sorted_files(dataset_root / "tumor", "*.tif")
    normal_slide_paths = _sorted_files(dataset_root / "normal", "*.tif")

    # Pair annotations with tumor slides by file stem rather than by list
    # position, so a missing or extra annotation file cannot shift the pairing
    # (or make the column lengths disagree).
    annotation_by_stem = {p.stem: p for p in annotation_paths}
    tumor_annotations = [annotation_by_stem.get(p.stem, "") for p in tumor_slide_paths]

    # turn them into a data frame and pad with empty annotation paths
    df = pd.DataFrame()
    df["slide"] = tumor_slide_paths + normal_slide_paths
    df["annotation"] = tumor_annotations + [""] * len(normal_slide_paths)
    df["label"] = ["tumor"] * len(tumor_slide_paths) + ["normal"] * len(normal_slide_paths)
    df["tags"] = ""

    return df


[docs]def testing(cam16_path: Path = Path("data", "camelyon16"), project_root: Path = None) -> pd.DataFrame:
    """ Create Camleyon 16 testing dataset
    
    This function goes through the input directories for the testing slides, 
    and matches up the annotations and slides. 
    It creates a dataframe with slide path with matching annotation path, and slide label.
    There is an empty tags column that is not used for this dataset 

    Args:
        cam16_path (Path, optional): a path relative to the project root that is the location 
            of the Camelyon 16 data. Defaults to data/camelyon16.
    Returns:
        df (pd.DataFrame): A dataframe with columns slide, annotation, label and tags
    """
    # set up the paths to the slides and annotations
    if project_root is None:
        dataset_root = Path(cam16_path) / "testing"
    else:
        dataset_root = project_root / Path(cam16_path) / "testing"

    annotations_dir = dataset_root / "lesion_annotations"
    slide_dir = dataset_root / "images"

    # all paths are relative to the dataset 'root' if defined
    if project_root is None:
        slide_paths = sorted([p for p in slide_dir.glob("*.tif")])
        annotation_paths = sorted(
            [p for p in annotations_dir.glob("*.xml")]
        )
    else:
        slide_paths = sorted([p.relative_to(project_root) for p in slide_dir.glob("*.tif")])
        annotation_paths = sorted(
            [p.relative_to(project_root) for p in annotations_dir.glob("*.xml")]
        )

    # get the slide name
    slide_names = [p.stem for p in slide_paths]

    # search for slides with annotations, add the annotation path if it exists else add empty string
    slides_annotations_paths = []
    for name in slide_names:
        a_path = ""
        for anno_path in annotation_paths:
            if name in str(anno_path):
                a_path = anno_path
        slides_annotations_paths.append(a_path)

    # get the slide labels by reading the csv file
    csv_path = dataset_root / "reference.csv"
    label_csv_file = pd.read_csv(csv_path, header=None)
    slide_labels = label_csv_file.iloc[:, 1]

    # turn them into a data frame and pad with empty annotation paths
    df = pd.DataFrame()
    df["slide"] = slide_paths
    df["annotation"] = slides_annotations_paths
    df["label"] = slide_labels
    df["tags"] = ""

    return df