Source code for sccloud.io.io

#!/usr/bin/env python

import time
import numpy as np
import pandas as pd
import os.path
from scipy.io import mmread
from scipy.sparse import csr_matrix, issparse
import tables
import gzip

from typing import List, Tuple
from . import Array2D, MemData

import anndata
import logging

logger = logging.getLogger("sccloud")


def load_10x_h5_file_v2(h5_in: "tables.File", fn: str, ngene: int = None) -> "MemData":
    """Load 10x v2 format matrix from hdf5 file

    Parameters
    ----------

    h5_in : tables.File
        An instance of tables.File class that is connected to a 10x v2 formatted hdf5 file.
    fn : `str`
        File name, can be used to indicate channel-specific name prefix.
    ngene : `int`, optional (default: None)
        Minimum number of genes to keep a barcode. Default is to keep all barcodes.

    Returns
    -------

    An MemData object containing genome-Array2D pair per genome.

    Examples
    --------
    >>> io.load_10x_h5_file_v2(h5_in)
    """

    data = MemData()
    for group in h5_in.list_nodes("/", "Group"):
        genome = group._v_name

        M, N = h5_in.get_node("/" + genome + "/shape").read()
        mat = csr_matrix(
            (
                h5_in.get_node("/" + genome + "/data").read(),
                h5_in.get_node("/" + genome + "/indices").read(),
                h5_in.get_node("/" + genome + "/indptr").read(),
            ),
            shape=(N, M),
        )

        barcodes = h5_in.get_node("/" + genome + "/barcodes").read().astype(str)
        ids = h5_in.get_node("/" + genome + "/genes").read().astype(str)
        names = h5_in.get_node("/" + genome + "/gene_names").read().astype(str)

        array2d = Array2D(
            {"barcodekey": barcodes}, {"featurekey": ids, "featurename": names}, mat
        )
        array2d.filter(ngene=ngene)
        array2d.separate_channels(fn)

        data.addData(genome, array2d)

    return data


def load_10x_h5_file_v3(h5_in: "tables.File", fn: str, ngene: int = None) -> "MemData":
    """Load 10x v3 format matrix from hdf5 file

    Parameters
    ----------

    h5_in : tables.File
        An instance of tables.File class that is connected to a 10x v3 formatted hdf5 file.
    fn : `str`
        File name, can be used to indicate channel-specific name prefix.
    ngene : `int`, optional (default: None)
        Minimum number of genes to keep a barcode. Default is to keep all barcodes.

    Returns
    -------

    An MemData object containing genome-Array2D pair per genome.

    Examples
    --------
    >>> io.load_10x_h5_file_v3(h5_in)
    """

    M, N = h5_in.get_node("/matrix/shape").read()
    bigmat = csr_matrix(
        (
            h5_in.get_node("/matrix/data").read(),
            h5_in.get_node("/matrix/indices").read(),
            h5_in.get_node("/matrix/indptr").read(),
        ),
        shape=(N, M),
    )
    barcodes = h5_in.get_node("/matrix/barcodes").read().astype(str)
    genomes = h5_in.get_node("/matrix/features/genome").read().astype(str)
    ids = h5_in.get_node("/matrix/features/id").read().astype(str)
    names = h5_in.get_node("/matrix/features/name").read().astype(str)

    data = MemData()
    for genome in np.unique(genomes):
        idx = genomes == genome

        barcode_metadata = {"barcodekey": barcodes}
        feature_metadata = {"featurekey": ids[idx], "featurename": names[idx]}
        mat = bigmat[:, idx].copy()
        array2d = Array2D(barcode_metadata, feature_metadata, mat)
        array2d.filter(ngene)
        array2d.separate_channels(fn)

        data.addData(genome, array2d)

    return data


def load_10x_h5_file(input_h5: str, ngene: int = None) -> "MemData":
    """Load 10x format matrix (either v2 or v3) from hdf5 file

    Parameters
    ----------

    input_h5 : `str`
        The matrix in 10x v2 or v3 hdf5 format.
    ngene : `int`, optional (default: None)
        Minimum number of genes to keep a barcode. Default is to keep all barcodes.

    Returns
    -------

    An MemData object containing genome-Array2D pair per genome.

    Examples
    --------
    >>> io.load_10x_h5_file('example_10x.h5')
    """

    fn = os.path.basename(input_h5)[:-3]

    data = None
    with tables.open_file(input_h5) as h5_in:
        try:
            node = h5_in.get_node("/matrix")
            data = load_10x_h5_file_v3(h5_in, fn, ngene)
        except tables.exceptions.NoSuchNodeError:
            data = load_10x_h5_file_v2(h5_in, fn, ngene)

    return data


def determine_file_name(
    path: str, names: List[str], errmsg: str, fname: str = None, exts: List[str] = None
) -> str:
    """ Try several file name options and determine which one is correct.
    """
    for name in names:
        file_name = os.path.join(path, name)
        if os.path.isfile(file_name):
            return file_name
    if fname is not None:
        for ext in exts:
            file_name = fname + ext
            if os.path.isfile(file_name):
                return file_name
    raise ValueError(errmsg)


def load_one_mtx_file(path: str, ngene: int = None, fname: str = None) -> "Array2D":
    """Load one gene-count matrix in mtx format into an Array2D object
    """
    mtx_file = determine_file_name(
        path,
        ["matrix.mtx.gz", "matrix.mtx"],
        "Expression matrix in mtx format is not found",
        fname=fname,
        exts=[".mtx"],
    )
    mat = csr_matrix(mmread(mtx_file).T)

    barcode_file = determine_file_name(
        path,
        ["cells.tsv.gz", "barcodes.tsv.gz", "barcodes.tsv"],
        "Barcode metadata information is not found",
        fname=fname,
        exts=["_barcode.tsv", ".cells.tsv"],
    )

    feature_file = determine_file_name(
        path,
        ["genes.tsv.gz", "features.tsv.gz", "genes.tsv"],
        "Feature metadata information is not found",
        fname=fname,
        exts=["_gene.tsv", ".genes.tsv"],
    )

    barcode_base = os.path.basename(barcode_file)
    feature_base = os.path.basename(feature_file)

    if barcode_base == "cells.tsv.gz" and feature_base == "genes.tsv.gz":
        format_type = "HCA DCP"
    elif barcode_base == "barcodes.tsv.gz" and feature_base == "features.tsv.gz":
        format_type = "10x v3"
    elif barcode_base == "barcodes.tsv" and feature_base == "genes.tsv":
        format_type = "10x v2"
    elif barcode_base.endswith("_barcode.tsv") and feature_base.endswith("_gene.tsv"):
        format_type = "scumi"
    elif barcode_base.endswith(".cells.tsv") and feature_base.endswith(".genes.tsv"):
        format_type = "dropEst"
    else:
        raise ValueError("Unknown format type")
    if format_type == "HCA DCP":
        barcode_metadata = pd.read_csv(barcode_file, sep="\t", header=0)
        assert "cellkey" in barcode_metadata
        barcode_metadata.rename(columns={"cellkey": "barcodekey"}, inplace=True)

        feature_metadata = pd.read_csv(feature_file, sep="\t", header=0)
    else:
        barcode_metadata = pd.read_csv(
            barcode_file, sep="\t", header=None, names=["barcodekey"]
        )

        if format_type == "10x v3":
            feature_metadata = pd.read_csv(
                feature_file,
                sep="\t",
                header=None,
                names=["featurekey", "featurename", "featuretype"],
            )
        elif format_type == "10x v2":
            feature_metadata = pd.read_csv(
                feature_file, sep="\t", header=None, names=["featurekey", "featurename"]
            )
        elif format_type == "scumi":
            values = (
                pd.read_csv(feature_file, sep="\t", header=None)
                .iloc[:, 0]
                .values.astype(str)
            )
            arr = np.array(np.char.split(values, sep="_", maxsplit=1).tolist())
            feature_metadata = pd.DataFrame(
                data={"featurekey": arr[:, 0], "featurename": arr[:, 1]}
            )
        elif format_type == "dropEst":
            feature_metadata = pd.read_csv(
                feature_file, sep="\t", header=None, names=["featurekey"]
            )
            feature_metadata["featurename"] = feature_metadata["featurekey"]
        else:
            raise ValueError("Unknown format type")

    array2d = Array2D(barcode_metadata, feature_metadata, mat)
    array2d.filter(ngene=ngene)
    if format_type == "10x v3" or format_type == "10x v2":
        array2d.separate_channels("")  # fn == '' refers to 10x mtx format

    return array2d


def load_mtx_file(path: str, genome: str = None, ngene: int = None) -> "MemData":
    """Load gene-count matrix from Market Matrix files (10x v2, v3 and HCA DCP formats)

    Parameters
    ----------

    path : `str`
        Path to mtx files. The directory impiled by path should either contain matrix, feature and barcode information, or folders containg these information.
    genome : `str`, optional (default: None)
        Genome name of the matrix. If None, genome will be inferred from path.
    ngene : `int`, optional (default: None)
        Minimum number of genes to keep a barcode. Default is to keep all barcodes.

    Returns
    -------

    An MemData object containing a genome-Array2D pair.

    Examples
    --------
    >>> io.load_10x_h5_file('example_10x.h5')
    """

    orig_file = None
    if not os.path.isdir(path):
        orig_file = path
        path = os.path.dirname(path)

    data = MemData()
    if (
        os.path.isfile(os.path.join(path, "matrix.mtx.gz"))
        or os.path.isfile(os.path.join(path, "matrix.mtx"))
        or (orig_file is not None and os.path.isfile(orig_file))
    ):
        if genome is None:
            genome = os.path.basename(path)
        data.addData(
            genome,
            load_one_mtx_file(
                path,
                ngene=ngene,
                fname=None if orig_file is None else os.path.splitext(orig_file)[0],
            ),
        )
    else:
        for dir_entry in os.scandir(path):
            if dir_entry.is_dir():
                data.addData(
                    dir_entry.name, load_one_mtx_file(dir_entry.path, ngene=ngene)
                )

    return data


def load_csv_file(
    input_csv: str, genome: str, sep: str = ",", ngene: int = None
) -> "MemData":
    """Load count matrix from a CSV-style file, such as CSV file or DGE style tsv file.

    Parameters
    ----------

    input_csv : `str`
        The CSV file, gzipped or not, containing the count matrix.
    genome : `str`
        The genome reference.
    sep: `str`, optional (default: ',')
        Separator between fields, either ',' or '\t'.
    ngene : `int`, optional (default: None)
        Minimum number of genes to keep a barcode. Default is to keep all barcodes.

    Returns
    -------

    An MemData object containing a genome-Array2D pair.

    Examples
    --------
    >>> io.load_csv_file('example_ADT.csv', genome = 'GRCh38')
    >>> io.load_csv_file('example.umi.dge.txt.gz', genome = 'GRCh38', sep = '\t')
    """

    path = os.path.dirname(input_csv)
    base = os.path.basename(input_csv)
    is_hca_csv = base == "expression.csv"

    if sep == "\t":
        # DGE, columns are cells, which is around thousands and we can use pandas.read_csv
        df = pd.read_csv(input_csv, header=0, index_col=0, sep=sep)
        mat = csr_matrix(df.values.T)
        barcode_metadata = {"barcodekey": df.columns.values}
        feature_metadata = {
            "featurekey": df.index.values,
            "featurename": df.index.values,
        }
    else:
        # For CSV files, wide columns prevent fast pd.read_csv loading
        converter = (
            float if base.startswith("expression") else int
        )  # If expression -> float otherwise int

        barcodes = []
        names = []
        stacks = []
        with (
            gzip.open(input_csv, mode="rt")
            if input_csv.endswith(".gz")
            else open(input_csv)
        ) as fin:
            barcodes = next(fin).strip().split(sep)[1:]
            for line in fin:
                fields = line.strip().split(sep)
                names.append(fields[0])
                stacks.append([converter(x) for x in fields[1:]])

        mat = csr_matrix(np.stack(stacks, axis=1 if not is_hca_csv else 0))
        barcode_metadata = {"barcodekey": barcodes}
        feature_metadata = {"featurekey": names, "featurename": names}

        if is_hca_csv:
            barcode_file = os.path.join(path, "cells.csv")
            if os.path.exists(barcode_file):
                barcode_metadata = pd.read_csv(barcode_file, sep=",", header=0)
                assert "cellkey" in barcode_metadata
                barcode_metadata.rename(columns={"cellkey": "barcodekey"}, inplace=True)

            feature_file = os.path.join(path, "genes.csv")
            if os.path.exists(feature_file):
                feature_metadata = pd.read_csv(feature_file, sep=",", header=0)

    data = MemData()
    array2d = Array2D(barcode_metadata, feature_metadata, mat)
    array2d.filter(ngene=ngene)
    data.addData(genome, array2d)

    return data


def load_loom_file(input_loom: str, genome: str, ngene: int = None) -> "MemData":
    """Load count matrix from a LOOM file. Currently only support HCA DCP Loom spec.

    Parameters
    ----------

    input_loom : `str`
        The LOOM file, containing the count matrix.
    genome : `str`
        The genome reference.
    ngene : `int`, optional (default: None)
        Minimum number of genes to keep a barcode. Default is to keep all barcodes.

    Returns
    -------

    An MemData object containing a genome-Array2D pair.

    Examples
    --------
    >>> io.load_loom_file('example.loom', genome = 'GRCh38', ngene = 200)
    """
    import loompy

    col_trans = {"CellID": "barcodekey"}
    row_trans = {"Accession": "featurekey", "Gene": "featurename"}

    data = MemData()
    with loompy.connect(input_loom) as ds:
        mat = csr_matrix(ds.sparse().T)
        barcode_metadata = {}
        for keyword, values in ds.col_attrs.items():
            keyword = col_trans.get(keyword, keyword)
            barcode_metadata[keyword] = values
        feature_metadata = {}
        for keyword, values in ds.row_attrs.items():
            keyword = row_trans.get(keyword, keyword)
            feature_metadata[keyword] = values

    array2d = Array2D(barcode_metadata, feature_metadata, mat)
    array2d.filter(ngene=ngene)
    data.addData(genome, array2d)

    return data


def load_scCloud_h5_file(
    input_h5: str, ngene: int = None, select_singlets: bool = False
) -> "MemData":
    """Load matrices from sccloud-format hdf5 file

    Parameters
    ----------

    input_h5 : `str`
        sccloud-format hdf5 file.
    ngene : `int`, optional (default: None)
        Minimum number of genes to keep a barcode. Default is to keep all barcodes.
    select_singlets: `bool`, optional (default: False)
        If only load singlets.

    Returns
    -------

    An MemData object containing genome-Array2D pair per genome.

    Examples
    --------
    >>> io.load_scCloud_h5_file('example.sccloud.h5')
    """

    cite_seq_name = None
    selected_barcodes = None

    data = MemData()
    with tables.open_file(input_h5) as h5_in:
        for group in h5_in.list_nodes("/", "Group"):
            genome = group._v_name

            M, N = h5_in.get_node("/" + genome + "/shape").read()
            mat = csr_matrix(
                (
                    h5_in.get_node("/" + genome + "/data").read(),
                    h5_in.get_node("/" + genome + "/indices").read(),
                    h5_in.get_node("/" + genome + "/indptr").read(),
                ),
                shape=(N, M),
            )

            barcode_metadata = {}
            for node in h5_in.walk_nodes("/" + genome + "/_barcodes", "Array"):
                values = node.read()
                if values.dtype.kind == "S":
                    values = values.astype(str)
                barcode_metadata[node.name] = values

            feature_metadata = {}
            for node in h5_in.walk_nodes("/" + genome + "/_features", "Array"):
                values = node.read()
                if values.dtype.kind == "S":
                    values = values.astype(str)
                feature_metadata[node.name] = values

            array2d = Array2D(barcode_metadata, feature_metadata, mat)
            if genome.startswith("CITE_Seq"):
                cite_seq_name = genome
            else:
                array2d.filter(ngene, select_singlets)
                selected_barcodes = array2d.get_metadata("barcodekey")

            data.addData(genome, array2d)

    if (cite_seq_name is not None) and (selected_barcodes is not None):
        array2d = data.getData(cite_seq_name)
        selected = array2d.get_metadata("barcodekey").isin(selected_barcodes)
        array2d.trim(selected)

    return data


def infer_file_format(input_file: str) -> Tuple[str, str, str]:
    """ Infer file format from input_file name

    This function infer file format by inspecting the file name.

    Parameters
    ----------

    input_file : `str`
        Input file name.

    Returns
    -------
    `str`
        File format, choosing from 'sccloud', '10x', 'h5ad', 'mtx', 'dge', and 'csv'.
    `str`
        The path covering all input files. Most time this is the same as input_file. But for HCA mtx and csv, this should be parent directory.
    `str`
        Type of the path, either 'file' or 'directory'.
    """

    file_format = None
    copy_path = input_file
    copy_type = "file"

    if input_file.endswith(".h5"):
        file_format = "10x"
    elif input_file.endswith(".h5sc"):
        file_format = "sccloud"
    elif input_file.endswith(".h5ad"):
        file_format = "h5ad"
    elif input_file.endswith(".loom"):
        file_format = "loom"
    elif (
        input_file.endswith(".mtx")
        or input_file.endswith(".mtx.gz")
        or os.path.splitext(input_file)[1] == ""
    ):
        file_format = "mtx"
        if os.path.splitext(input_file)[1] != "":
            copy_path = os.path.dirname(input_file)
        copy_type = "directory"
    elif input_file.endswith("dge.txt.gz"):
        file_format = "dge"
    elif input_file.endswith(".csv") or input_file.endswith(".csv.gz"):
        file_format = "csv"
        if os.path.basename(input_file) == "expression.csv":
            copy_type = os.path.dirname(input_file)
            copy_type = "directory"
    else:
        raise ValueError("Unrecognized file type for file {}!".format(input_file))

    return file_format, copy_path, copy_type


[docs]def read_input(
    input_file: str,
    genome: str = None,
    return_type: str = "AnnData",
    concat_matrices: bool = False,
    h5ad_mode: str = "a",
    ngene: int = None,
    select_singlets: bool = False,
    channel_attr: str = None,
    black_list: List[str] = [],
) -> "MemData or AnnData or List[AnnData]":
    """Load data into memory.

    This function is used to load input data into memory. Inputs can be in 10x genomics v2 & v3 formats (hdf5 or mtx), HCA DCP mtx and csv formats, Drop-seq dge format, and CSV format.

    Parameters
    ----------

    input_file : `str`
        Input file name.
    genome : `str`, optional (default: None)
        A string contains comma-separated genome names. sccloud will read all matrices matching the genome names. If genomes is None, all matrices will be considered.
    return_type : `str`
        Return object type, can be either 'MemData' or 'AnnData'.
    concat_matrices : `boolean`, optional (default: False)
        If input file contains multiple matrices, if concatenate them into one AnnData object or return a list of AnnData objects.
    h5ad_mode : `str`, optional (default: `a`)
        If input is in h5ad format, the backed mode for loading the data. mode could be 'a', 'r', 'r+'. 'a' refers to load all into memory.
    ngene : `int`, optional (default: None)
        Minimum number of genes to keep a barcode. Default is to keep all barcodes.
    select_singlets : `bool`, optional (default: False)
        If only keep DemuxEM-predicted singlets when loading data.
    channel_attr : `str', optional (default: None)
        Use channel_attr to represent different samples. This will set a 'Channel' column field with channel_attr.
    black_list : `List[str]`, optional (default: [])
        Attributes in black list will be poped out.

    Returns
    -------
    `MemData` object or `anndata` object or a list of `anndata` objects
        An `MemData` object or `anndata` object or a list of `anndata` objects containing the count matrices.

    Examples
    --------
    >>> adata = io.read_input('example_10x.h5', genomes = 'mm10')
    >>> adata = io.read_input('example.h5ad', mode = 'r+')
    >>> adata = io.read_input('example_ADT.csv')
    """

    start = time.time()

    input_file = os.path.expanduser(os.path.expandvars(input_file))
    file_format, _, _ = infer_file_format(input_file)

    if file_format == "sccloud":
        data = load_scCloud_h5_file(
            input_file, ngene=ngene, select_singlets=select_singlets
        )
    elif file_format == "10x":
        data = load_10x_h5_file(input_file, ngene=ngene)
    elif file_format == "h5ad":
        data = anndata.read_h5ad(
            input_file, backed=(None if h5ad_mode == "a" else h5ad_mode)
        )
    elif file_format == "mtx":
        data = load_mtx_file(input_file, genome, ngene=ngene)
    elif file_format == "loom":
        assert genome is not None
        data = load_loom_file(input_file, genome, ngene=ngene)
    else:
        assert (file_format == "dge" or file_format == "csv") and (genome is not None)
        data = load_csv_file(
            input_file, genome, sep=("\t" if file_format == "dge" else ","), ngene=ngene
        )

    if file_format != "h5ad":
        data.restrain_keywords(genome)
        if return_type == "AnnData":
            data = data.convert_to_anndata(concat_matrices=concat_matrices, channel_attr=channel_attr, black_list=black_list)
    else:
        assert (return_type == "AnnData") and (channel_attr is None) and (black_list == [])

    end = time.time()
    logger.info("Read input is finished. Time spent = {:.2f}s.".format(end - start))

    return data


def _parse_whitelist(whitelist: List[str]):
    parse_results = {}
    for value in whitelist:
        tokens = value.split("/")
        curr_dict = parse_results
        for i in range(len(tokens) - 1):
            if tokens[i] not in curr_dict:
                curr_dict[tokens[i]] = dict()
            curr_dict = curr_dict[tokens[i]]
            if curr_dict is None:
                break
        if curr_dict is not None:
            curr_dict[tokens[-1]] = None
    return parse_results


def _update_backed_h5ad(group: "hdf5 group", dat: dict, whitelist: dict):
    import h5py
    from collections.abc import Mapping

    for key, value in dat.items():
        if not isinstance(key, str):
            logging.warning(
                "Dictionary key {} is transformed to str upon writing to h5,"
                "using string keys is recommended".format(key)
            )
            key = str(key)

        if whitelist is None or key in whitelist:
            if isinstance(value, Mapping):
                subgroup = (
                    group[key] if key in group.keys() else group.create_group(key)
                )
                assert isinstance(subgroup, h5py.Group)
                _update_backed_h5ad(
                    subgroup, value, whitelist[key] if whitelist is not None else None
                )
            else:
                if key in group.keys():
                    del group[key] 
                if issparse(value):
                    sparse_mat = group.create_group(key)
                    sparse_mat.attrs["h5sparse_format"] = value.format
                    sparse_mat.attrs["h5sparse_shape"] = np.array(value.shape)
                    sparse_mat.create_dataset("data", data=value.data, compression="gzip")
                    sparse_mat.create_dataset("indices", data=value.indices, compression="gzip")
                    sparse_mat.create_dataset("indptr", data=value.indptr, compression="gzip")
                else:
                    value = np.array(value) if np.ndim(value) > 0 else np.array([value])
                    sdt = h5py.special_dtype(vlen=str)
                    if value.dtype.kind in {"U", "O"} :
                        value = value.astype(sdt)
                    if value.dtype.names is not None:
                        new_dtype = value.dtype.descr
                        convert_type = False
                        for i in range(len(value.dtype)):
                            if value.dtype[i].kind in {"U", "O"}:
                                new_dtype[i] = (new_dtype[i][0], sdt)
                                convert_type = True
                        if convert_type:
                            value = value.astype(new_dtype)
                    group.create_dataset(key, data=value, compression="gzip")


[docs]def write_output(
    data: "MemData or AnnData", output_file: str, whitelist: List = ["obs", "obsm", "uns", "var", "varm"]
) -> None:
    """ Write data back to disk.

    This function is used to write data back to disk.

    Parameters
    ----------

    data : `MemData` or `AnnData`
        data to write back, can be either an MemData or AnnData object.
    output_file : `str`
        output file name. If data is MemData, output_file should ends with suffix '.h5sc'. Otherwise, output_file can end with either '.h5ad' or '.loom'. If output_file ends with '.loom', a LOOM file will be generated. If no suffix is detected, an appropriate one will be appended.
    whitelist : `list`, optional, default = ["obs", "obsm", "uns", "var", "varm"]
        List that indicates changed fields when writing h5ad file in backed mode. For example,
         ['uns/Groups', 'obsm/PCA'] will only write Groups in uns, and PCA in obsm; the rest of the fields will be unchanged.
    Returns
    -------
    None

    Examples
    --------
    >>> io.write_output(adata, 'test.h5ad')
    """

    start = time.time()

    if (not isinstance(data, MemData)) and (not isinstance(data, anndata.AnnData)):
        raise ValueError("data is neither an MemData nor AnnData object!")

    # Identify and correct file suffix
    file_name, _, suffix = output_file.rpartition(".")
    if file_name == "":
        file_name = output_file
        suffix = "h5sc" if isinstance(data, MemData) else "h5ad"
    if isinstance(data, MemData) and suffix != "h5sc":
        logging.warning(
            "Detected file suffix for this MemData object is not .h5sc. We will assume output_file is a file name and append .h5sc suffix."
        )
        file_name = output_file
        suffix = "h5sc"
    if isinstance(data, anndata.AnnData) and (suffix not in ["h5ad", "loom"]):
        logging.warning(
            "Detected file suffix for this AnnData object is neither .h5ad or .loom. We will assume output_file is a file name and append .h5ad suffix."
        )
        file_name = output_file
        suffix = "h5ad"
    output_file = file_name + "." + suffix

    # Eliminate objects starting with fmat_ from uns
    if isinstance(data, anndata.AnnData):
        keys = list(data.uns)
        for keyword in keys:
            if keyword.startswith("fmat_"):
                data.uns.pop(keyword)

    # Write outputs
    if suffix == "h5sc":
        data.write_h5_file(output_file)
    elif suffix == "loom":
        data.write_loom(output_file, write_obsm_varm=True)
    elif not data.isbacked or (data.isbacked and data.file._file.h5f.mode != "r+"):
        data.write(output_file, compression="gzip")
    else:
        assert data.file._file.h5f.mode == "r+"
        import h5py

        h5_file = data.file._file.h5f
        # Fix old h5ad files in which obsm/varm were stored as compound datasets
        for key in ["obsm", "varm"]:
            if key in h5_file.keys() and isinstance(h5_file[key], h5py.Dataset):
                del h5_file[key]
                whitelist.append(key)

        _update_backed_h5ad(
            h5_file, data._to_dict_fixed_width_arrays(), _parse_whitelist(whitelist)
        )
        h5_file.close()

    end = time.time()
    logger.info("Write output is finished. Time spent = {:.2f}s.".format(end - start))