import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import shutil
from pathlib import Path
from wildlife_datasets import datasets, analysis
from tqdm import tqdm
import matplotlib.pyplot as plt
from PIL.ExifTags import TAGS
from PIL import Image
import os
import exifread
from pymediainfo import MediaInfo
Newt Dataset Preparation
This notebook is used to prepare the newt dataset for training and evaluation.
# Set pandas display options to show all columns and wide output.
# Passing None removes the corresponding limit.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
download_kaggle_dataset
download_kaggle_dataset (dataset_name, download_path)
# Fetch the raw Bar Hill survey dataset into the local working directory.
download_kaggle_dataset("mshahoyi/bar-hill-surveys", "./data/barhill-unprocessed")
Dataset already exists at ./data/barhill-unprocessed
'./data/barhill-unprocessed'
# Root folder of the 2024 surveys inside the downloaded dataset.
data_root = Path("./data/barhill-unprocessed")/'Bar Hill Surveys 2024'
# Materialize the (dirpath, dirnames, filenames) triples so they can be sliced and reused.
walk = list(os.walk(data_root))
walk[:4]
[('data/barhill-unprocessed/Bar Hill Surveys 2024',
['Survey 5 06_06_24',
'Survey 7 12_06_24',
'Survey 2 10_05_24',
'Survey 1 09_05_24',
'Survey 6 07_06_24',
'Survey 3 16_05_24',
'Survey 8 13_06_24',
'Survey 4 17_05_24'],
['Bar Hill GCN Survey Results 2024.xlsx',
'ARU Research poster A1 (003) Ecoacoustics researching at Cambridge crematorium_edit (002).png',
'GCN Surveys at Cambridge Crematorium 2024.docx']),
('data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24',
['Pond 4', 'Pond 3', 'Pond 2'],
[]),
('data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 4',
['GCN1-P4-S5'],
[]),
('data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 4/GCN1-P4-S5',
[],
['IMG_3558.MOV',
'IMG_3551.MOV',
'IMG_3553.MOV',
'IMG_3555.JPEG',
'IMG_3556.JPEG',
'IMG_3556.MOV',
'IMG_3554.JPEG',
'IMG_3557.JPEG',
'IMG_3552.MOV',
'IMG_3551.JPEG',
'IMG_3557.MOV',
'IMG_3552.JPEG',
'IMG_3555.MOV',
'IMG_3553.JPEG',
'IMG_3554.MOV',
'IMG_3558.JPEG'])]
# Keep only the directories whose own name contains "gcn" (case-insensitive).
gcns = [x for x in walk if 'gcn' in os.path.basename(x[0]).lower()]
gcns[:2]
[('data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 4/GCN1-P4-S5',
[],
['IMG_3558.MOV',
'IMG_3551.MOV',
'IMG_3553.MOV',
'IMG_3555.JPEG',
'IMG_3556.JPEG',
'IMG_3556.MOV',
'IMG_3554.JPEG',
'IMG_3557.JPEG',
'IMG_3552.MOV',
'IMG_3551.JPEG',
'IMG_3557.MOV',
'IMG_3552.JPEG',
'IMG_3555.MOV',
'IMG_3553.JPEG',
'IMG_3554.MOV',
'IMG_3558.JPEG']),
('data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 3/GCN2-P3-S5',
[],
['IMG_3564.JPEG',
'IMG_3565.MOV',
'IMG_3565.JPEG',
'IMG_3563.MOV',
'IMG_3561.MOV',
'IMG_3566.JPEG',
'IMG_3564.MOV',
'IMG_3562.MOV',
'IMG_3561.JPEG',
'IMG_3563.JPEG',
'IMG_3566.MOV',
'IMG_3562.JPEG'])]
# Pair each GCN folder name with the full paths of the files it contains.
data = [
    (os.path.basename(root), [os.path.join(root, f) for f in files])
    for root, _, files in gcns
]
dict(data[:2])
{'GCN1-P4-S5': ['data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 4/GCN1-P4-S5/IMG_3558.MOV',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 4/GCN1-P4-S5/IMG_3551.MOV',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 4/GCN1-P4-S5/IMG_3553.MOV',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 4/GCN1-P4-S5/IMG_3555.JPEG',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 4/GCN1-P4-S5/IMG_3556.JPEG',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 4/GCN1-P4-S5/IMG_3556.MOV',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 4/GCN1-P4-S5/IMG_3554.JPEG',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 4/GCN1-P4-S5/IMG_3557.JPEG',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 4/GCN1-P4-S5/IMG_3552.MOV',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 4/GCN1-P4-S5/IMG_3551.JPEG',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 4/GCN1-P4-S5/IMG_3557.MOV',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 4/GCN1-P4-S5/IMG_3552.JPEG',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 4/GCN1-P4-S5/IMG_3555.MOV',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 4/GCN1-P4-S5/IMG_3553.JPEG',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 4/GCN1-P4-S5/IMG_3554.MOV',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 4/GCN1-P4-S5/IMG_3558.JPEG'],
'GCN2-P3-S5': ['data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 3/GCN2-P3-S5/IMG_3564.JPEG',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 3/GCN2-P3-S5/IMG_3565.MOV',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 3/GCN2-P3-S5/IMG_3565.JPEG',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 3/GCN2-P3-S5/IMG_3563.MOV',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 3/GCN2-P3-S5/IMG_3561.MOV',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 3/GCN2-P3-S5/IMG_3566.JPEG',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 3/GCN2-P3-S5/IMG_3564.MOV',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 3/GCN2-P3-S5/IMG_3562.MOV',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 3/GCN2-P3-S5/IMG_3561.JPEG',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 3/GCN2-P3-S5/IMG_3563.JPEG',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 3/GCN2-P3-S5/IMG_3566.MOV',
'data/barhill-unprocessed/Bar Hill Surveys 2024/Survey 5 06_06_24/Pond 3/GCN2-P3-S5/IMG_3562.JPEG']}
# One row per file: column 1 holds the per-folder file lists, so explode it.
metadata = pd.DataFrame(data).explode(1)
metadata.columns = ["reference_id", "file_path"]
metadata.shape
(2504, 2)
metadata.head()
identity | file_path | |
---|---|---|
0 | GCN1-P4-S5 | data/barhill-unprocessed/Bar Hill Surveys 2024... |
0 | GCN1-P4-S5 | data/barhill-unprocessed/Bar Hill Surveys 2024... |
0 | GCN1-P4-S5 | data/barhill-unprocessed/Bar Hill Surveys 2024... |
0 | GCN1-P4-S5 | data/barhill-unprocessed/Bar Hill Surveys 2024... |
0 | GCN1-P4-S5 | data/barhill-unprocessed/Bar Hill Surveys 2024... |
metadata.info()
<class 'pandas.core.frame.DataFrame'>
Index: 2504 entries, 0 to 206
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 identity 2504 non-null object
1 file_path 2504 non-null object
dtypes: object(2)
memory usage: 58.7+ KB
# Bare file name of every entry.
metadata['file_name'] = metadata['file_path'].map(os.path.basename)
# Flag videos by their actual extension; a plain substring test would also
# match any file whose name merely contains "mov".
metadata['is_video'] = metadata['file_name'].map(lambda name: Path(name).suffix.lower() == '.mov')
metadata.head()
identity | file_path | file_name | is_video | |
---|---|---|---|---|
0 | GCN1-P4-S5 | data/barhill-unprocessed/Bar Hill Surveys 2024... | IMG_3558.MOV | True |
0 | GCN1-P4-S5 | data/barhill-unprocessed/Bar Hill Surveys 2024... | IMG_3551.MOV | True |
0 | GCN1-P4-S5 | data/barhill-unprocessed/Bar Hill Surveys 2024... | IMG_3553.MOV | True |
0 | GCN1-P4-S5 | data/barhill-unprocessed/Bar Hill Surveys 2024... | IMG_3555.JPEG | False |
0 | GCN1-P4-S5 | data/barhill-unprocessed/Bar Hill Surveys 2024... | IMG_3556.JPEG | False |
# Assign each distinct reference_id a 1-based integer identity, in order of first appearance.
mapper = {reference_id: i+1 for i, reference_id in enumerate(metadata.reference_id.unique())}
metadata['identity'] = metadata.reference_id.map(mapper)
metadata
# Start from an empty output directory on every run.
output_dir = Path("./data/gcns-processed")
shutil.rmtree(output_dir, ignore_errors=True)
# parents=True so the cell also works when ./data does not exist yet.
output_dir.mkdir(parents=True, exist_ok=True)
class UnprocessedNewtsDataset(datasets.WildlifeDataset):
    """Wildlife dataset built from the still images listed in ``metadata``."""

    def create_catalogue(self) -> pd.DataFrame:
        """Return the image rows of the module-level ``metadata`` frame.

        Rows flagged ``is_video`` are dropped, and the ``file_name`` and
        ``file_path`` columns are renamed to ``image_name`` and ``path``.
        """
        images = metadata[~metadata.is_video]
        return images.rename(columns={"file_name": "image_name", "file_path": "path"})
# Build the image-only dataset and save a preview grid.
dataset = UnprocessedNewtsDataset('.')
plt.figure(figsize=(7, 7))
dataset.plot_grid()
plt.savefig(output_dir/'grid.png')
Split the dataset
# Print the dataset statistics and save the accompanying figure.
plt.figure(figsize=(3.5, 3))
plt.title("Number of images per identity")
analysis.display_statistics(dataset.df)
plt.savefig(output_dir/'distribution.png')
Number of identitites 204
Number of all animals 1252
Number of animals with one image 0
Number of unidentified animals 0
# Headline counts over the full metadata table (images and videos).
n_files = len(metadata)
n_images = len(metadata[~metadata.is_video])
n_videos = len(metadata[metadata.is_video])
n_identities = len(metadata.identity.unique())
stats = {
    "n_files": n_files,
    "n_images": n_images,
    "n_videos": n_videos,
    "n_identities": n_identities,
}
# index=[0] turns the dict of scalars into a single-row frame.
pd.DataFrame(stats, index=[0]).to_csv(output_dir/'statistics.csv', index=False)
stats
Split: time-unaware disjoint-set
Samples: train/test/unassigned/total = 625/627/0/1252
Classes: train/test/unassigned/total = 110/94/0/204
Samples: train only/test only = 625/627
Classes: train only/test only/joint = 110/94/0
Fraction of train set = 49.92%
Fraction of test set only = 50.08%
Split: time-unaware disjoint-set
Samples: train/test/unassigned/total = 312/315/0/627
Classes: train/test/unassigned/total = 50/44/0/94
Samples: train only/test only = 312/315
Classes: train only/test only/joint = 50/44/0
Fraction of train set = 49.76%
Fraction of test set only = 50.24%
Train: 625, Test: 312, Validation: 315
# Work on a copy with a unique 0..n-1 index so that label-based writes
# below touch exactly one row each.
metadata_new = metadata.copy().reset_index(drop=True)

# Copy every file to <output_dir>/newts/<identity>/<file_name> and record
# the new location relative to output_dir.
for i, row in tqdm(metadata_new.iterrows(), total=len(metadata_new)):
    new_path = Path('newts')/str(row.identity)/row.file_name
    (output_dir/new_path).parent.mkdir(parents=True, exist_ok=True)
    shutil.copy(row.file_path, output_dir/new_path)
    # Store a forward-slash string rather than a Path object so the value
    # written to CSV does not depend on the operating system.
    metadata_new.loc[i, 'file_path'] = new_path.as_posix()
identity | path | image_name | is_video | |
---|---|---|---|---|
0 | GCN1-P1-S2 | data/barhill-unprocessed/Bar Hill Surveys 2024... | IMG_2312.JPEG | False |
1 | GCN1-P1-S2 | data/barhill-unprocessed/Bar Hill Surveys 2024... | IMG_2310.JPEG | False |
2 | GCN1-P1-S2 | data/barhill-unprocessed/Bar Hill Surveys 2024... | IMG_2309.JPEG | False |
3 | GCN1-P1-S2 | data/barhill-unprocessed/Bar Hill Surveys 2024... | IMG_2311.JPEG | False |
4 | GCN1-P1-S2 | data/barhill-unprocessed/Bar Hill Surveys 2024... | IMG_2313.JPEG | False |
Extract the capture time of each image and video
def get_image_creation_date(image_path):
    """Return the EXIF capture time of an image, or None if unavailable.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A ``pd.Timestamp`` parsed from the ``EXIF DateTimeOriginal`` tag
        with ``utc=True``, or ``None`` when the tag is absent or its value
        does not match the ``YYYY:MM:DD HH:MM:SS`` layout.
    """
    with open(image_path, 'rb') as f:
        # NOTE(review): exifread's documentation shows stop_tag with a bare
        # tag name (e.g. 'DateTimeOriginal'); if the prefixed name is never
        # matched, the early stop does not trigger -- confirm.
        tags = exifread.process_file(f, stop_tag="EXIF DateTimeOriginal")

    date_tag = tags.get("EXIF DateTimeOriginal")
    if not date_tag:
        return None

    # errors='coerce' turns a malformed date string into NaT instead of
    # raising, so one bad file cannot abort a whole extraction loop.
    # NOTE(review): EXIF times are presumably camera-local; utc=True labels
    # them as UTC without converting -- confirm this is intended.
    parsed = pd.to_datetime(str(date_tag), format='%Y:%m:%d %H:%M:%S', utc=True, errors='coerce')
    return None if pd.isna(parsed) else parsed
def get_video_creation_date(video_path):
    """Return the creation time of a video from its container metadata.

    Args:
        video_path: Path of the video file to inspect.

    Returns:
        A ``pd.Timestamp`` floored to whole seconds, taken from the first
        "General" track that carries a date, or ``None`` if no such track
        has one.
    """
    media_info = MediaInfo.parse(video_path)
    for track in media_info.tracks:
        if track.track_type != "General":
            continue
        # First non-empty field wins.
        # NOTE(review): field order reconstructed from a garbled export --
        # confirm tagged_date should take precedence over recorded_date.
        date = (
            track.tagged_date or
            track.recorded_date or
            track.encoded_date
        )
        if date:
            return pd.to_datetime(date).floor('s')
    return None
def get_creation_date(row):
    """Return the creation time of the file described by a metadata row.

    Args:
        row: Object exposing ``file_path`` (joined onto the module-level
            ``output_dir``) and ``is_video``.

    Returns:
        Whatever ``get_video_creation_date`` returns for video rows and
        ``get_image_creation_date`` returns for all other rows.
    """
    file_path = os.path.join(output_dir, row.file_path)
    if row.is_video:
        return get_video_creation_date(file_path)
    return get_image_creation_date(file_path)
# Start with an all-missing column, then fill it one row at a time.
metadata_new['creation_date'] = pd.NA

for i, row in tqdm(metadata_new.iterrows(), total=len(metadata_new)):
    creation_date = get_creation_date(row)
    metadata_new.at[i, 'creation_date'] = creation_date
metadata_new
2503it [00:06, 380.40it/s]
identity | file_path | file_name | is_video | split | |
---|---|---|---|---|---|
0 | GCN1-P4-S5 | test/GCN1-P4-S5/IMG_3558.JPEG | IMG_3558.MOV | True | test |
0 | GCN1-P4-S5 | test/GCN1-P4-S5/IMG_3558.JPEG | IMG_3551.MOV | True | test |
0 | GCN1-P4-S5 | test/GCN1-P4-S5/IMG_3558.JPEG | IMG_3553.MOV | True | test |
0 | GCN1-P4-S5 | test/GCN1-P4-S5/IMG_3558.JPEG | IMG_3555.JPEG | False | test |
0 | GCN1-P4-S5 | test/GCN1-P4-S5/IMG_3558.JPEG | IMG_3556.JPEG | False | test |
# Persist the final metadata table next to the copied files.
metadata_new.to_csv(output_dir/'metadata.csv', index=False)
metadata_new
Upload to Kaggle
upload_to_kaggle
upload_to_kaggle (user_id, title, id, licenses, keywords, dataset_dir)
# Publish the processed directory as a Kaggle dataset.
upload_to_kaggle(user_id="mshahoyi",
                 title="Barhill Great Crested Newts",
                 id="barhill-newts-all",
                 licenses=[{"name": "CC0-1.0"}],
                 keywords=["biology", "computer-vision", "animals", "great crested newts"],
                 dataset_dir="./data/gcns-processed")
import nbdev; nbdev.nbdev_export()