"""
This module creates the dataframe for the camelyon 16 dataset with the follwing columns:
- The slide column stores the paths on disk of the whole slide images.
- The annotation column records a path to the annotation files.
- The label column is the slide level label.
- The tags column is blank for camelyon 16.
This assumes there is a folder on disk structured the same as downloading
from the camelyon grand challenge Camelyon 16 google drive:
https://camelyon17.grand-challenge.org/Data/
"""
from pathlib import Path
import pandas as pd
[docs]def training(cam16_path: Path = Path("data", "camelyon16"), project_root: Path = None) -> pd.DataFrame:
""" Create Camleyon 16 training dataset
This function goes through the input directories for the training slides,
and matches up the annotations and slides.
It creates a dataframe with slide path with matching annotation path, and slide label.
There is an empty tags column that is not used for this dataset
Args:
cam16_path (Path, optional): a path relative to the project root that is the location
of the Camelyon 16 data. Defaults to data/camelyon16.
Returns:
df (pd.DataFrame): A dataframe with columns slide, annotation, label and tags
"""
# set up the paths to the slides and annotations
if project_root is None:
dataset_root = Path(cam16_path) / "training"
else:
dataset_root = project_root / Path(cam16_path) / "training"
annotations_dir = dataset_root / "lesion_annotations"
tumor_slide_dir = dataset_root / "tumor"
normal_slide_dir = dataset_root / "normal"
# all paths are relative to the project root if defined
if project_root is None:
annotation_paths = sorted(
[p for p in annotations_dir.glob("*.xml")]
)
tumor_slide_paths = sorted(
[p for p in tumor_slide_dir.glob("*.tif")]
)
normal_slide_paths = sorted(
[p for p in normal_slide_dir.glob("*.tif")]
)
else:
annotation_paths = sorted(
[p.relative_to(project_root) for p in annotations_dir.glob("*.xml")]
)
tumor_slide_paths = sorted(
[p.relative_to(project_root) for p in tumor_slide_dir.glob("*.tif")]
)
normal_slide_paths = sorted(
[p.relative_to(project_root) for p in normal_slide_dir.glob("*.tif")]
)
# turn them into a data frame and pad with empty annotation paths
df = pd.DataFrame()
df["slide"] = tumor_slide_paths + normal_slide_paths
df["annotation"] = annotation_paths + ["" for _ in range(len(normal_slide_paths))]
df["label"] = ["tumor"] * len(tumor_slide_paths) + ["normal"] * len(
normal_slide_paths
)
df["tags"] = ""
return df
[docs]def testing(cam16_path: Path = Path("data", "camelyon16"), project_root: Path = None) -> pd.DataFrame:
""" Create Camleyon 16 testing dataset
This function goes through the input directories for the testing slides,
and matches up the annotations and slides.
It creates a dataframe with slide path with matching annotation path, and slide label.
There is an empty tags column that is not used for this dataset
Args:
cam16_path (Path, optional): a path relative to the project root that is the location
of the Camelyon 16 data. Defaults to data/camelyon16.
Returns:
df (pd.DataFrame): A dataframe with columns slide, annotation, label and tags
"""
# set up the paths to the slides and annotations
if project_root is None:
dataset_root = Path(cam16_path) / "testing"
else:
dataset_root = project_root / Path(cam16_path) / "testing"
annotations_dir = dataset_root / "lesion_annotations"
slide_dir = dataset_root / "images"
# all paths are relative to the dataset 'root' if defined
if project_root is None:
slide_paths = sorted([p for p in slide_dir.glob("*.tif")])
annotation_paths = sorted(
[p for p in annotations_dir.glob("*.xml")]
)
else:
slide_paths = sorted([p.relative_to(project_root) for p in slide_dir.glob("*.tif")])
annotation_paths = sorted(
[p.relative_to(project_root) for p in annotations_dir.glob("*.xml")]
)
# get the slide name
slide_names = [p.stem for p in slide_paths]
# search for slides with annotations, add the annotation path if it exists else add empty string
slides_annotations_paths = []
for name in slide_names:
a_path = ""
for anno_path in annotation_paths:
if name in str(anno_path):
a_path = anno_path
slides_annotations_paths.append(a_path)
# get the slide labels by reading the csv file
csv_path = dataset_root / "reference.csv"
label_csv_file = pd.read_csv(csv_path, header=None)
slide_labels = label_csv_file.iloc[:, 1]
# turn them into a data frame and pad with empty annotation paths
df = pd.DataFrame()
df["slide"] = slide_paths
df["annotation"] = slides_annotations_paths
df["label"] = slide_labels
df["tags"] = ""
return df