# Source code for mioXpektron.utils.file_management

# Data import function

import re
from pathlib import Path
from typing import Callable, Dict, Optional, Tuple

import numpy as np
import polars as pl

# Default group patterns: regex → group label.
# Patterns are tested in insertion order; the first match wins.
# The lookahead ``(?=_|\b)`` requires the ``_CC`` / ``_CT`` token to be
# followed by an underscore or a word boundary, so e.g. ``_CCX`` does not
# match. ``_resolve_group`` searches these with ``re.IGNORECASE``.
DEFAULT_GROUP_PATTERNS: Dict[str, str] = {
    r'_CC(?=_|\b)': 'Cancer',
    r'_CT(?=_|\b)': 'Control',
}

# Accepted header names for the m/z and intensity columns. ``_resolve_column``
# tries them left to right, so earlier aliases take priority when a file
# contains more than one of them.
MZ_COLUMN_ALIASES = ("m/z", "mz")
INTENSITY_COLUMN_ALIASES = ("corrected_intensity", "Intensity", "intensity")


def _resolve_group(
    sample_name: str,
    group_patterns: Optional[Dict[str, str]],
    group_fn: Optional[Callable[[str], str]],
) -> str:
    """Map *sample_name* to its group label.

    Resolution order:

    1. *group_fn* -- when supplied, its return value is used as-is.
    2. *group_patterns* -- when supplied (even if empty), each regex is
       searched case-insensitively in insertion order; the first hit wins.
    3. ``DEFAULT_GROUP_PATTERNS`` -- used only when *group_patterns* is
       ``None``.

    ``'Unknown'`` is returned when no pattern matches.
    """
    if group_fn is not None:
        return group_fn(sample_name)

    active = DEFAULT_GROUP_PATTERNS if group_patterns is None else group_patterns
    # Lazy generator: patterns after the first match are never evaluated.
    hits = (
        label
        for regex, label in active.items()
        if re.search(regex, sample_name, flags=re.IGNORECASE)
    )
    return next(hits, 'Unknown')


def _resolve_column(df: pl.DataFrame, aliases: Tuple[str, ...]) -> str:
    """Pick the column of *df* that corresponds to one of *aliases*.

    Aliases are tried in the order given. For each alias an exact name match
    is preferred; failing that, a case-insensitive match is accepted and the
    column's actual spelling is returned.

    Raises
    ------
    ValueError
        If none of the aliases matches any column.
    """
    columns = df.columns

    # Lower-cased name -> actual name; a later column overwrites an earlier
    # one that shares the same lower-cased spelling.
    lowered = {}
    for name in columns:
        lowered[name.lower()] = name

    for candidate in aliases:
        if candidate in columns:
            return candidate
        folded = candidate.lower()
        if folded in lowered:
            return lowered[folded]

    raise ValueError(f"Missing required columns. Tried aliases: {aliases}")


def import_data(
    file_path: str,
    mz_min: Optional[float] = None,
    mz_max: Optional[float] = None,
    group_patterns: Optional[Dict[str, str]] = None,
    group_fn: Optional[Callable[[str], str]] = None,
) -> Tuple[np.ndarray, np.ndarray, str, str]:
    """
    Import ToF-SIMS data from a spectrum file.

    Parameters
    ----------
    file_path : str
        Path to the ToF-SIMS data file. Supports tab-delimited ``.txt``
        exports with ``m/z`` + ``Intensity`` columns and CSV exports with
        ``mz`` + ``corrected_intensity`` or ``intensity`` columns.
    mz_min : float, optional
        Minimum m/z value to be imported (inclusive).
    mz_max : float, optional
        Maximum m/z value to be imported (inclusive).
    group_patterns : dict[str, str], optional
        Mapping of ``{regex_pattern: group_label}``. Patterns are tested
        against the sample name (filename without extension) in order;
        the first match determines the group.
        Defaults to ``{'_CC...': 'Cancer', '_CT...': 'Control'}``.
    group_fn : callable, optional
        A function ``(sample_name: str) -> str`` that returns the group
        label directly. When provided this takes priority over
        *group_patterns*.

    Returns
    -------
    mz : np.ndarray
        Mass-to-charge ratio values.
    intensity : np.ndarray
        Intensity values.
    sample_name : str
        Sample name extracted from file name.
    group : str
        Group label derived from the filename.

    Raises
    ------
    ValueError
        If no m/z or intensity column can be resolved from the known
        aliases, or if no rows remain once the optional m/z bounds have
        been applied.
    """
    path = Path(file_path)
    # Only a ``.csv`` suffix (any case) selects comma; everything else is
    # read as tab-delimited.
    separator = "," if path.suffix.lower() == ".csv" else "\t"

    # Read file, skip lines starting with '#'
    df = pl.read_csv(file_path, separator=separator, comment_prefix="#")
    mz_col = _resolve_column(df, MZ_COLUMN_ALIASES)
    intensity_col = _resolve_column(df, INTENSITY_COLUMN_ALIASES)

    # Apply m/z filtering (inclusive)
    if mz_min is not None:
        df = df.filter(pl.col(mz_col) >= mz_min)
    if mz_max is not None:
        df = df.filter(pl.col(mz_col) <= mz_max)

    if df.height == 0:
        raise ValueError(f"No data in {file_path} after m/z filtering.")

    mz = df[mz_col].to_numpy()
    intensity = df[intensity_col].to_numpy()

    # The file stem doubles as the sample identifier and as the text the
    # group patterns / group_fn are applied to.
    sample_name = path.stem
    group = _resolve_group(sample_name, group_patterns, group_fn)

    return mz, intensity, sample_name, group