# Source code for jipipe.data_slot

# -*- coding: utf-8 -*-

"""
This file provides functions that are used to manage the contents of a data slot

Zoltán Cseresnyés, Ruman Gerst

Research Group Applied Systems Biology - Head: Prof. Dr. Marc Thilo Figge
https://www.leibniz-hki.de/en/applied-systems-biology.html
HKI-Center for Systems Biology of Infection
Leibniz Institute for Natural Product Research and Infection Biology - Hans Knöll Institute (HKI)
Adolf-Reichwein-Straße 23, 07745 Jena, Germany

The project code is licensed under BSD 2-Clause.
See the LICENSE file provided with the code for the full license.
"""

from os import makedirs
from os.path import isdir
from pathlib import Path
import json
import shutil



[docs]
class DataSlot:
    """
    Models a JIPipe data slot.

    A slot is a table of rows. Each row owns a numeric sub-folder inside the slot storage path,
    a true data type ID, a dict of text annotations and a dict of data annotations.
    """

    def __init__(self, data_type: str, storage_path: Path, node_id="", internal_path: Path = "", name: str = "",
                 slot_type: str = ""):
        """
        Initializes a new data slot
        :param data_type: the JIPipe data type ID that is accepted in this slot. The most basic type is 'jipipe:data'
        :param slot_type: the type of the slot. Allowed values are 'input' and 'output'. Can be empty.
        :param storage_path: a directory where the slot data is stored. If it does not exist, one will be created
        :param node_id: Metadata that indicates the node associated to this slot. Can be empty.
        :param internal_path: Metadata that indicates the placement of this data within a hierarchy of data. Can be empty.
        :param name: Name of the slot. Can be empty.
        """
        self.data_type = data_type
        self.slot_type = slot_type
        self.storage_path = storage_path
        self.node_id = node_id
        self.internal_path = internal_path
        self.name = name
        # Number of rows; the three lists below always have exactly this many entries
        self.rows = 0
        self.text_annotations = []
        self.data_annotations = []
        self.true_data_types = []

        # exist_ok avoids the check-then-create race of a separate isdir() test
        makedirs(storage_path, exist_ok=True)

    def get_row_storage_path(self, row: int):
        """
        Returns the storage path for the provided row
        :param row: the row index
        :return: the storage path for the row (the slot storage path joined with the row index)
        """
        return Path(self.storage_path) / str(row)

    def copy_row(self, source_data_slot, source_row: int):
        """
        Copies data from the source slot into this slot.
        A new row is appended to this slot; it receives shallow copies of the source row's annotations
        and a copy of the source row's storage folder.
        :param source_data_slot: the source slot
        :param source_row: the source row
        :return: None
        """
        target_row = self.add_row(true_data_type=source_data_slot.true_data_types[source_row])
        self.text_annotations[target_row] = source_data_slot.text_annotations[source_row].copy()
        self.data_annotations[target_row] = source_data_slot.data_annotations[source_row].copy()

        source_path = source_data_slot.get_row_storage_path(source_row)
        target_path = self.get_row_storage_path(target_row)
        print(f"Copy {source_path} to {target_path}")
        # add_row() already created the target folder, hence dirs_exist_ok
        shutil.copytree(src=source_path, dst=target_path, dirs_exist_ok=True)

    def add_row(self, n: int = 1, text_annotations: dict = None, data_annotations: dict = None, true_data_type: str = None):
        """
        Adds n rows into the slot and creates the storage folder of each new row
        :param true_data_type: the true data type ID of this entry. If none, the slot data type is used
        :param text_annotations: annotations to set for these rows (each row gets its own shallow copy)
        :param data_annotations: annotations to set for these rows (each row gets its own shallow copy)
        :param n: how many rows to add
        :return: the last row index
        """
        if text_annotations is None:
            text_annotations = {}
        if data_annotations is None:
            data_annotations = {}
        if true_data_type is None:
            true_data_type = self.data_type
        for _ in range(n):
            row = self.rows
            self.rows += 1
            self.true_data_types.append(true_data_type)
            self.text_annotations.append(text_annotations.copy())
            self.data_annotations.append(data_annotations.copy())
            makedirs(self.get_row_storage_path(row), exist_ok=True)
        return self.rows - 1

    def to_table(self):
        """
        Converts the data slot into a Pandas table. The format is equivalent to data-table.csv generated by JIPipe.
        Text annotation columns are named after the annotation, data annotation columns are prefixed with '$'.
        Annotation columns appear in the order in which their names are first encountered in the rows.
        :return: Pandas table
        """
        row_count = self.rows
        csv_data = {
            "jipipe:node-id": [self.node_id] * row_count,
            "jipipe:slot": [self.name] * row_count,
            "jipipe:data-type": [self.data_type] * row_count,
            "jipipe:true-data-type": list(self.true_data_types[:row_count]),
            "jipipe:index": list(range(row_count))
        }

        # Dict keys keep insertion order, so the column order is reproducible across runs
        # (iterating a set of strings is subject to hash randomization).
        text_annotation_columns = {}
        for annotations in self.text_annotations:
            text_annotation_columns.update(dict.fromkeys(annotations))
        data_annotation_columns = {}
        for annotations in self.data_annotations:
            data_annotation_columns.update(dict.fromkeys(annotations))

        # Rows that lack an annotation get an empty string in that column
        for annotation_column in text_annotation_columns:
            csv_data[annotation_column] = [self.text_annotations[row].get(annotation_column, "")
                                           for row in range(row_count)]
        for annotation_column in data_annotation_columns:
            csv_data["$" + annotation_column] = [str(self.data_annotations[row].get(annotation_column, ""))
                                                 for row in range(row_count)]

        # Imported lazily so that pandas is only required when a table is requested
        from pandas import DataFrame
        return DataFrame(data=csv_data)

    def to_dict(self):
        """
        Converts the data slot into a dict that can be serialized into JSON
        :return: a dictionary that describes this slot
        """
        rows = []
        for row in range(self.rows):
            rows.append({
                "index": row,
                "true-data-type": self.true_data_types[row],
                "text-annotations": [
                    {"name": name, "value": value}
                    for name, value in self.text_annotations[row].items()
                ],
                "data-annotations": [
                    {
                        "name": name,
                        "true-data-type": value["true-data-type"],
                        "row-storage-folder": value["row-storage-folder"]
                    }
                    for name, value in self.data_annotations[row].items()
                ]
            })
        return {
            "node-id": self.node_id,
            "slot": self.name,
            # internal_path may be a Path object, which the json module cannot serialize
            "internal-path": str(self.internal_path),
            "data-type": self.data_type,
            "rows": rows
        }

    def save(self, with_csv=False):
        """
        Saves all metadata related to this slot in the storage folder.
        This will overwrite data-table.json and data-table.csv (if enabled)
        :param with_csv: Also write the table as CSV. The format is equivalent to the one generated by JIPipe. Writing with CSV requires pandas to be installed.
        :return: None
        """
        storage_path = Path(self.storage_path)
        with open(storage_path / "data-table.json", "w", encoding="utf-8") as f:
            json.dump(self.to_dict(), f, indent=4, ensure_ascii=False)

        if with_csv:
            self.to_table().to_csv(storage_path / "data-table.csv")





[docs]
def import_from_folder(storage_path: Path):
    """
    Imports a data slot from a storage path.
    A valid storage path contains 'data-table.json' and multiple numeric folders.
    Rows are added in ascending order of their 'index' entry. The storage path is also used as
    the internal path of the returned slot, and the slot type is left empty.
    :param storage_path: the storage path
    :return: a DataSlot instance
    """
    with open(Path(storage_path) / "data-table.json", "r", encoding="utf-8") as f:
        data_table = json.load(f)

    data_slot = DataSlot(data_type=data_table["data-type"],
                         storage_path=storage_path,
                         slot_type="",
                         internal_path=storage_path,
                         node_id=data_table.get("node-id", ""),
                         name=data_table.get("slot", ""))

    for row in sorted(data_table.get("rows", []), key=lambda entry: entry["index"]):
        text_annotations = {
            annotation["name"]: annotation["value"]
            for annotation in row.get("text-annotations", [])
        }
        # Data annotations are stored as the whole annotation entry, keyed by name.
        # They belong in data_annotations; writing them into text_annotations would
        # lose them and corrupt the text annotations.
        data_annotations = {
            annotation["name"]: annotation
            for annotation in row.get("data-annotations", [])
        }
        # A missing true data type is passed as None, which add_row replaces with the slot data type
        data_slot.add_row(text_annotations=text_annotations,
                          data_annotations=data_annotations,
                          true_data_type=row.get("true-data-type"))
    return data_slot