Source code for mioXpektron.denoise.test_denoise_selection

import numpy as np
import pandas as pd

from . import main as denoise_main_api
from .denoise_select import (
    _measure_on_method,
    _measure_one_peak,
    _build_denoising_method_grid,
    aggregate_method_summaries,
    plot_pareto_delta_snr_vs_height,
    rank_methods_pandas,
    select_methods,
)


def _summary_row(method: str, **overrides):
    row = {
        "method": method,
        "peaks_total": 10,
        "peaks_matched": 9,
        "peaks_lost": 1,
        "frac_matched": 0.9,
        "mz_shift_med": 0.0,
        "mz_shift_iqr": 0.1,
        "pct_height_med": -2.0,
        "pct_height_iqr": 1.0,
        "pct_fwhm_med": -3.0,
        "pct_fwhm_iqr": 1.5,
        "pct_area_med": -4.0,
        "pct_area_iqr": 2.0,
        "sigma_raw_global": 2.0,
        "sigma_new_global": 1.0,
        "noise_reduction_db": 6.0,
        "delta_snr_db_med": 4.0,
        "delta_snr_db_iqr": 0.5,
        "hf_power_reduction_db": 3.0,
        "hf_frac_new_global": 0.2,
    }
    row.update(overrides)
    return row


def test_measure_on_method_rejects_monotonic_shoulder():
    """A strictly decreasing trace must not be matched to a reference peak.

    A reference peak is measured on a narrow Gaussian centred at 100.0, then
    the matcher is run on a linear ramp falling from 5.0 to 1.0 over the same
    axis. The expected result is ``None``.
    """
    mz_axis = np.linspace(99.995, 100.005, 101)
    gaussian = 10.0 * np.exp(-((mz_axis - 100.0) / 0.0006) ** 2)
    apex_index = int(np.argmax(gaussian))
    reference = _measure_one_peak(mz_axis, gaussian, apex_index, prominence=10.0)

    # Monotonic data inside the search window has no true local maximum, but
    # the previous implementation still accepted its boundary argmax.
    falling_ramp = np.linspace(5.0, 1.0, mz_axis.size)
    result = _measure_on_method(mz_axis, falling_ramp, reference, search_ppm=1000.0)

    assert result is None
def test_aggregate_method_summaries_preserves_unique_methods():
    """Rows sharing a method name roll up into a single row per method.

    Three input rows cover two method names. The roll-up is expected to hold
    two rows; for ``method_a`` the unit count is 2, peak counts are summed
    (14 total, 10 matched), ``frac_matched`` equals 10/14 and
    ``delta_snr_db_med`` equals 4.0 for inputs of 3.0 and 5.0.
    """
    records = [
        _summary_row("method_a", peaks_total=10, peaks_matched=8, peaks_lost=2, delta_snr_db_med=3.0),
        _summary_row("method_a", peaks_total=4, peaks_matched=2, peaks_lost=2, delta_snr_db_med=5.0),
        _summary_row("method_b", peaks_total=6, peaks_matched=6, peaks_lost=0, delta_snr_db_med=2.0),
    ]
    rollup = aggregate_method_summaries(pd.DataFrame(records), unit_label="windows")

    is_method_a = rollup["method"] == "method_a"
    method_a = rollup.loc[is_method_a].iloc[0]

    assert len(rollup) == 2
    assert method_a["windows"] == 2
    assert method_a["peaks_total"] == 14
    assert method_a["peaks_matched"] == 10
    assert np.isclose(method_a["frac_matched"], 10 / 14)
    assert np.isclose(method_a["delta_snr_db_med"], 4.0)
def test_build_denoising_method_grid_excludes_derivatives_by_default():
    """Derivative variants appear in the grid only when explicitly requested.

    The grid is built twice on the same three-point axis, once with
    ``include_derivatives=False`` and once with ``True``. The test pins both
    grid sizes and checks the ``:deriv_`` / ``:order_`` tags in the names.
    """
    x = np.array([100.0, 100.1, 100.2], dtype=float)
    shared_options = {"resample_to_uniform": False, "target_dx": None}

    default_grid = _build_denoising_method_grid(x, include_derivatives=False, **shared_options)
    derivative_grid = _build_denoising_method_grid(x, include_derivatives=True, **shared_options)

    assert len(default_grid) == 511
    assert len(derivative_grid) == 561

    # Without derivatives, any name carrying a derivative/order tag must carry
    # the zeroth one.
    assert not any(":deriv_" in name and ":deriv_0" not in name for name in default_grid)
    assert not any(":order_" in name and ":order_0" not in name for name in default_grid)

    # With derivatives, first-order variants must be present.
    assert any(":deriv_1" in name for name in derivative_grid)
    assert any(":order_1" in name for name in derivative_grid)
def test_rank_methods_pandas_applies_ppm_normalized_selection_criteria():
    """The m/z-shift IQR gate is evaluated in ppm, not in absolute m/z units.

    Two summary rows differ only in ``mz_shift_iqr`` (0.001 vs 0.020). With
    reference peaks at m/z 1000 the ranked table is expected to report 1 ppm
    and 20 ppm respectively, pass the first row through every gate, fail the
    second, and rank the passing method first.
    """
    shared_metrics = dict(
        frac_matched=0.95,
        mz_shift_med=0.0,
        pct_height_med=8.0,
        pct_height_iqr=10.0,
        pct_fwhm_med=6.0,
        pct_fwhm_iqr=8.0,
        pct_area_med=7.0,
        pct_area_iqr=9.0,
        noise_reduction_db=4.0,
        delta_snr_db_med=4.0,
    )
    summary = pd.DataFrame(
        [
            _summary_row("method_ok", mz_shift_iqr=0.001, **shared_metrics),
            _summary_row("method_bad_ppm", mz_shift_iqr=0.020, **shared_metrics),
        ]
    )
    per_peak = pd.DataFrame({"mz_ref": [1000.0, 1000.0]})

    ranked = rank_methods_pandas(summary, per_peak)
    by_method = ranked["method"]
    ok_row = ranked.loc[by_method == "method_ok"].iloc[0]
    bad_row = ranked.loc[by_method == "method_bad_ppm"].iloc[0]

    assert np.isclose(ok_row["mz_shift_iqr_ppm"], 1.0)
    assert np.isclose(bad_row["mz_shift_iqr_ppm"], 20.0)

    # Each gate flag must be set for the good row and cleared for the bad one.
    for flag in (
        "pass_mz_shift_iqr_ppm",
        "passes_peak_preservation",
        "passes_selection_criteria",
    ):
        assert bool(ok_row[flag])
        assert not bool(bad_row[flag])

    assert int(bad_row["failed_criteria_count"]) >= 1
    assert ranked.iloc[0]["method"] == "method_ok"
def test_select_methods_constrained_pareto_then_snr_prefers_highest_snr():
    """Gate-passing frontier methods are selected in descending SNR gain.

    Four candidates are supplied: two expected on the frontier, one dominated
    and one that fails the selection gate. With
    ``basis="constrained_pareto_then_snr"`` and ``top_k=2`` the frontier is
    expected to be ``{"low_bias", "high_snr"}`` and the selection order
    ``["high_snr", "low_bias"]``.
    """
    columns = (
        "method",
        "abs_height",
        "delta_snr_db_med",
        "frac_matched",
        "selection_score",
        "passes_selection_criteria",
    )
    candidates = (
        ("low_bias", 1.0, 5.0, 0.97, 0.40, True),
        ("high_snr", 2.0, 7.0, 0.93, 0.55, True),
        ("dominated", 2.5, 6.0, 0.95, 0.35, True),
        ("fails_gate", 0.5, 9.0, 0.99, 0.10, False),
    )
    summary = pd.DataFrame([dict(zip(columns, values)) for values in candidates])

    _unused, frontier, selected = select_methods(
        summary,
        basis="constrained_pareto_then_snr",
        top_k=2,
    )

    assert set(frontier["method"]) == {"low_bias", "high_snr"}
    assert list(selected["method"]) == ["high_snr", "low_bias"]
def test_plot_pareto_falls_back_to_all_finite_candidates_when_none_pass():
    """The Pareto plot still renders, with a notice, when no method passes.

    Both candidate rows have ``passes_selection_criteria=False``. The returned
    axes are expected to carry a title containing the "No methods passed the
    current selection criteria" notice.
    """
    import matplotlib.pyplot as plt

    summary = pd.DataFrame(
        [
            {
                "method": "method_a",
                "abs_height": 1.0,
                "delta_snr_db_med": 2.0,
                "passes_selection_criteria": False,
            },
            {
                "method": "method_b",
                "abs_height": 2.0,
                "delta_snr_db_med": 3.0,
                "passes_selection_criteria": False,
            },
        ]
    )

    ax = plot_pareto_delta_snr_vs_height(
        summary,
        save_plot=False,
        save_pareto=False,
    )
    try:
        assert "No methods passed the current selection criteria" in ax.get_title()
    finally:
        # Close the figure even when the assertion fails, so an open figure
        # does not leak into the tests that run afterwards.
        plt.close(ax.figure)
def test_compare_in_windows_ranks_aggregated_rollup(monkeypatch):
    """``compare_in_windows`` ranks the roll-up, not the per-window table.

    ``compare_methods_in_windows`` and ``rank_method`` are replaced on the
    main module with stubs. The ranking stub records its inputs so the test
    can check that it received the one-row roll-up (no ``"window"`` column),
    the per-peak detail frame and ``input_format="pandas"``.
    """
    base_row = _summary_row("method_a")
    rollup = pd.DataFrame([base_row])
    window_summary = pd.DataFrame(
        [dict(base_row, window=label) for label in ("[400,410]", "[440,450]")]
    )
    detail = pd.DataFrame([{"method": "method_a", "mz_ref": 100.0}])
    captured = {}

    def stub_compare_methods_in_windows(*args, **kwargs):
        return rollup, window_summary, detail

    def stub_rank_method(*, summary_df, per_peak_df, input_format, **kwargs):
        # Store copies so later mutation by the caller cannot affect the checks.
        captured["summary_df"] = summary_df.copy()
        captured["per_peak_df"] = per_peak_df.copy()
        captured["input_format"] = input_format
        return summary_df

    monkeypatch.setattr(denoise_main_api, "compare_methods_in_windows", stub_compare_methods_in_windows)
    monkeypatch.setattr(denoise_main_api, "rank_method", stub_rank_method)

    mz_values = np.array([100.0, 101.0])
    intensities = np.array([1.0, 2.0])
    dm = denoise_main_api.DenoisingMethods(mz_values, intensities)
    summary = dm.compare_in_windows(windows=[(400, 410)], save_summary=False)

    assert len(summary) == 1
    assert len(captured["summary_df"]) == 1
    assert "window" not in captured["summary_df"].columns
    assert captured["input_format"] == "pandas"
    assert len(captured["per_peak_df"]) == 1
def test_compare_across_files_aggregates_sample_level_summaries(monkeypatch, tmp_path):
    """``compare_across_files`` merges per-sample results into one ranking.

    Two empty placeholder files stand in for spectra. The loader, the
    per-spectrum comparison and the ranking step are replaced on the main
    module with stubs. The comparison stub also checks that the method-level
    options (``n_jobs``, ``parallel_backend``, ``progress``,
    ``include_derivatives``) reach it. The result is expected to hold two
    ranked methods each covering two spectra, four per-sample summary rows
    for samples ``sample_a`` and ``sample_b``, and four detail rows.
    """
    file_a = tmp_path / "sample_a.txt"
    file_b = tmp_path / "sample_b.txt"
    for placeholder in (file_a, file_b):
        placeholder.write_text("")

    def stub_load_txt_spectrum(path):
        # The peak height encodes which sample was loaded.
        amp = 2.0
        if path.stem.endswith("a"):
            amp = 1.0
        return {
            "mz": np.array([99.9, 100.0, 100.1], dtype=float),
            "intensity": np.array([0.0, amp, 0.0], dtype=float),
        }

    def stub_compare_denoising_methods(mz, intensity, **kwargs):
        assert kwargs["n_jobs"] == 3
        assert kwargs["parallel_backend"] == "thread"
        assert kwargs["progress"] is False
        assert kwargs["include_derivatives"] is True

        amp = float(intensity[1])
        summary = pd.DataFrame(
            [
                _summary_row("method_a", delta_snr_db_med=3.0 + amp, noise_reduction_db=5.0 + amp),
                _summary_row("method_b", delta_snr_db_med=2.0 + amp, noise_reduction_db=4.0 + amp),
            ]
        )
        detail = pd.DataFrame(
            [{"method": name, "mz_ref": 100.0} for name in ("method_a", "method_b")]
        )
        return summary, detail

    def stub_rank_method(*, summary_df, per_peak_df, input_format, **kwargs):
        assert input_format == "pandas"
        assert "spectra" in summary_df.columns
        return summary_df.sort_values("method").reset_index(drop=True)

    replacements = (
        ("load_txt_spectrum", stub_load_txt_spectrum),
        ("compare_denoising_methods", stub_compare_denoising_methods),
        ("rank_method", stub_rank_method),
    )
    for attr_name, stub in replacements:
        monkeypatch.setattr(denoise_main_api, attr_name, stub)

    ranked, sample_summary_all, detail_all = denoise_main_api.DenoisingMethods.compare_across_files(
        [file_b, file_a],
        min_mz=99.5,
        max_mz=100.5,
        file_n_jobs=1,
        method_n_jobs=3,
        method_parallel_backend="thread",
        include_derivatives=True,
        progress=False,
        save_summary=False,
    )

    assert len(ranked) == 2
    assert all(ranked["spectra"] == 2)
    assert len(sample_summary_all) == 4
    assert set(sample_summary_all["sample"]) == {"sample_a", "sample_b"}
    assert len(detail_all) == 4