Source code for jipipe.data_slot

# -*- coding: utf-8 -*-

"""
This file provides functions that are used to manage the contents of a data slot

Zoltán Cseresnyés, Ruman Gerst

Research Group Applied Systems Biology - Head: Prof. Dr. Marc Thilo Figge
https://www.leibniz-hki.de/en/applied-systems-biology.html
HKI-Center for Systems Biology of Infection
Leibniz Institute for Natural Product Research and Infection Biology - Hans Knöll Institute (HKI)
Adolf-Reichwein-Straße 23, 07745 Jena, Germany

The project code is licensed under BSD 2-Clause.
See the LICENSE file provided with the code for the full license.
"""

from os import makedirs
from os.path import isdir
from pathlib import Path
import json
import shutil


class DataSlot:
    """
    Models a JIPipe data slot.

    A slot is a table of rows stored below ``storage_path``. Every row has a
    true data type, a dict of text annotations, a dict of data annotations and
    its own sub-directory named after the row index.
    """

    def __init__(self, data_type: str, storage_path: Path, node_id="", internal_path: Path = "", name: str = "",
                 slot_type: str = ""):
        """
        Initializes a new data slot

        :param data_type: the JIPipe data type ID that is accepted in this slot. The most basic type is 'jipipe:data'
        :param slot_type: the type of the slot. Allowed values are 'input' and 'output'. Can be empty.
        :param storage_path: a directory where the slot data is stored. If it does not exist, one will be created
        :param node_id: Metadata that indicates the node associated to this slot. Can be empty.
        :param internal_path: Metadata that indicates the placement of this data within a hierarchy of data.
                              Can be empty.
        :param name: Name of the slot. Can be empty.
        """
        self.data_type = data_type
        self.slot_type = slot_type
        self.storage_path = storage_path
        self.node_id = node_id
        self.internal_path = internal_path
        self.name = name
        self.rows = 0
        # The three lists below are parallel: index i describes row i
        self.text_annotations = []
        self.data_annotations = []
        self.true_data_types = []
        # exist_ok avoids the race between checking for the directory and creating it
        makedirs(storage_path, exist_ok=True)

    def get_row_storage_path(self, row: int):
        """
        Returns the storage path for the provided row

        :param row: the row index
        :return: the storage path for the row (storage path joined with the row index)
        """
        return Path(self.storage_path) / Path(str(row))

    def copy_row(self, source_data_slot, source_row: int):
        """
        Copies data from the source slot into this slot.
        A new row is appended to this slot; it receives copies of the source row's annotations
        and the contents of the source row's storage directory.

        :param source_data_slot: the source slot
        :param source_row: the source row
        """
        target_row = self.add_row(true_data_type=source_data_slot.true_data_types[source_row])
        self.text_annotations[target_row] = source_data_slot.text_annotations[source_row].copy()
        self.data_annotations[target_row] = source_data_slot.data_annotations[source_row].copy()
        source_row_storage_path = source_data_slot.get_row_storage_path(source_row)
        target_row_storage_path = self.get_row_storage_path(target_row)
        print("Copy " + str(source_row_storage_path) + " to " + str(target_row_storage_path))
        # add_row() already created the target directory, hence dirs_exist_ok
        shutil.copytree(src=source_row_storage_path, dst=target_row_storage_path, dirs_exist_ok=True)

    def add_row(self, n: int = 1, text_annotations: dict = None, data_annotations: dict = None,
                true_data_type: str = None):
        """
        Adds n rows into the slot and creates the storage directory of each new row

        :param true_data_type: the true data type ID of this entry. If none, the slot data type is used
        :param text_annotations: annotations to set for these rows
        :param data_annotations: annotations to set for these rows
        :param n: how many rows to add
        :return: the last row index
        """
        if text_annotations is None:
            text_annotations = {}
        if data_annotations is None:
            data_annotations = {}
        if true_data_type is None:
            true_data_type = self.data_type
        for _ in range(n):
            self.rows += 1
            self.true_data_types.append(true_data_type)
            # Each row gets its own copy so that later edits do not leak between rows
            self.text_annotations.append(text_annotations.copy())
            self.data_annotations.append(data_annotations.copy())
            makedirs(self.get_row_storage_path(self.rows - 1), exist_ok=True)
        return self.rows - 1

    def to_table(self):
        """
        Converts the data slot into a Pandas table. The format is equivalent to data-table.csv generated by JIPipe.
        Text annotations become columns named after the annotation; data annotation columns are
        prefixed with '$' and contain the string representation of the annotation.
        Rows that lack an annotation get an empty string in the respective column.

        :return: Pandas table
        """
        row_indices = range(self.rows)
        csv_data = {
            "jipipe:node-id": [self.node_id for _ in row_indices],
            "jipipe:slot": [self.name for _ in row_indices],
            "jipipe:data-type": [self.data_type for _ in row_indices],
            "jipipe:true-data-type": [self.true_data_types[row] for row in row_indices],
            "jipipe:index": list(row_indices)
        }
        # dict.fromkeys keeps the first-seen order, which makes the column order
        # reproducible between runs (a set of strings is not)
        text_annotation_columns = dict.fromkeys(
            name for annotations in self.text_annotations for name in annotations)
        data_annotation_columns = dict.fromkeys(
            name for annotations in self.data_annotations for name in annotations)
        for annotation_column in text_annotation_columns:
            csv_data[annotation_column] = [self.text_annotations[row].get(annotation_column, "")
                                           for row in row_indices]
        for annotation_column in data_annotation_columns:
            csv_data["$" + annotation_column] = [str(self.data_annotations[row].get(annotation_column, ""))
                                                 for row in row_indices]
        # Imported lazily so that pandas is only required when a table is requested
        from pandas import DataFrame
        return DataFrame(data=csv_data)

    def to_dict(self):
        """
        Converts the data slot into a dict that can be serialized into JSON

        :return: a dictionary that describes this slot
        """
        return {
            "node-id": self.node_id,
            "slot": self.name,
            # The internal path may be a Path object, which json cannot serialize
            "internal-path": str(self.internal_path),
            "data-type": self.data_type,
            "rows": [{
                "index": row,
                "true-data-type": self.true_data_types[row],
                "text-annotations": [{
                    "name": name,
                    "value": value
                } for name, value in self.text_annotations[row].items()],
                "data-annotations": [{
                    "name": name,
                    "true-data-type": value["true-data-type"],
                    "row-storage-folder": value["row-storage-folder"]
                } for name, value in self.data_annotations[row].items()]
            } for row in range(self.rows)]
        }

    def save(self, with_csv=False):
        """
        Saves all metadata related to this slot in the storage folder.
        This will overwrite data-table.json and data-table.csv (if enabled)

        :param with_csv: Also write the table as CSV. The format is equivalent to the one generated by JIPipe.
                         Writing with CSV requires pandas to be installed.
        :return: None
        """
        storage_path = Path(self.storage_path)
        json_data = self.to_dict()
        with open(storage_path / "data-table.json", "w", encoding="utf-8") as f:
            json.dump(json_data, f, indent=4, ensure_ascii=False)
        if with_csv:
            df = self.to_table()
            df.to_csv(storage_path / "data-table.csv")
def import_from_folder(storage_path: Path):
    """
    Imports a data slot from a storage path.
    A valid storage path contains 'data-table.json' and multiple numeric folders.
    Rows are imported in ascending order of their 'index' entry.

    :param storage_path: the storage path
    :return: a DataSlot instance
    """
    with open(Path(storage_path) / "data-table.json", "r", encoding="utf-8") as f:
        data_table = json.load(f)
    data_slot = DataSlot(data_type=data_table["data-type"],
                         storage_path=storage_path,
                         slot_type="",
                         internal_path=storage_path,
                         node_id=data_table.get("node-id", ""),
                         name=data_table.get("slot", ""))
    for row in sorted(data_table.get("rows", []), key=lambda entry: entry["index"]):
        text_annotations = {annotation["name"]: annotation["value"]
                            for annotation in row["text-annotations"]}
        # Data annotations are kept as whole entries and must go into their own
        # dict; storing them among the text annotations loses them on import
        data_annotations = {annotation["name"]: annotation
                            for annotation in row["data-annotations"]}
        data_slot.add_row(text_annotations=text_annotations,
                          data_annotations=data_annotations,
                          true_data_type=row["true-data-type"])
    return data_slot