From bc4719361ea06d2cea47179a59be9cd0be283e7c Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 24 Apr 2026 11:44:58 +0100 Subject: [PATCH 01/10] refactor(mzid): finish removing jmzidml library dependency PR #23 introduced CvParamInfo as the internal replacement for uk.ac.ebi.jmzidml.model.mzidml.CvParam but left three call sites (ActivationMethod, SpectraAccessor, Unimod) still importing the jmzidml types via the mzid.Constants helper. The pom.xml also still declared the jmzidml dependency (plus a cpdetector/sammoa-group repo only needed to resolve it) even though no reachable code referenced it. This commit finishes the removal: - migrate ActivationMethod, SpectraAccessor, Unimod to CvParamInfo - delete mzid.Constants.java (wrapper over Constants.makeCvParam) - drop jmzidml + pride-xml-handler transitive + sammoa-group repo + the jaxb-api/jaxb-runtime deps they required - delete the orphan test.mzid fixtures (no code path generates or reads mzIdentML any more) Scoped tests green: TestDirectPinWriter, TestMassCalibrator, TestPrecursorCalScaffolding (43/43). --- docs/examples/readme.md | 3 +- docs/examples/test.mzid | 153 ----------------- pom.xml | 30 +--- .../ucsd/msjava/msutil/ActivationMethod.java | 8 +- .../edu/ucsd/msjava/msutil/CvParamInfo.java | 51 +++--- .../ucsd/msjava/msutil/SpectraAccessor.java | 12 +- .../java/edu/ucsd/msjava/mzid/Constants.java | 159 ------------------ .../java/edu/ucsd/msjava/mzid/Unimod.java | 27 +-- .../edu/ucsd/msjava/mzml/StaxMzMLParser.java | 2 +- src/test/resources/test.mzid | 153 ----------------- 10 files changed, 53 insertions(+), 545 deletions(-) delete mode 100644 docs/examples/test.mzid delete mode 100644 src/main/java/edu/ucsd/msjava/mzid/Constants.java delete mode 100644 src/test/resources/test.mzid diff --git a/docs/examples/readme.md b/docs/examples/readme.md index a8c25161..ffb4dca0 100644 --- a/docs/examples/readme.md +++ b/docs/examples/readme.md @@ -1,6 +1,6 @@ # Example configuration and sample outputs -Broader manuals (MS-GF+ CLI, BuildSA, MzID→TSV, changelog, …) are Markdown pages one level up: [`../readme.md`](../readme.md). +Broader manuals (MS-GF+ CLI, BuildSA, changelog, …) are Markdown pages one level up: [`../readme.md`](../readme.md). This directory holds **small text examples** shipped with the repository: @@ -12,7 +12,6 @@ This directory holds **small text examples** shipped with the repository: | `protocols.txt` | Example custom protocols for `params/protocols.txt`. | | `Mods.txt` | Example modification file. | | `pxd001819_example.pin` | **Sample Percolator `.pin` output** from a PXD001819 (yeast + UPS1 on LTQ Orbitrap Velos) search. Header + 20 target PSMs + 10 decoy PSMs, chosen for peptide-sequence diversity so every column is represented. Use this to inspect the `.pin` schema without running a full search. Full column reference in [`../output.md`](../output.md). | -| `test.mzid` | Tiny mzIdentML sample used in static documentation. | | `test.tsv`, `test_Unrolled.tsv` | Example TSV exports for documentation. | **Not stored here:** tutorial spreadsheets, plots, or bundled FASTA/index files. Those are either removed as non-essential bloat or live under `src/test/resources/` for automated tests. diff --git a/docs/examples/test.mzid b/docs/examples/test.mzid deleted file mode 100644 index bf654b45..00000000 --- a/docs/examples/test.mzid +++ /dev/null @@ -1,153 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - IGAYLFVDMAHVAGLIAAGVYPNPVPHAHVVTSTTHK - - - NLANPTSVILASIQMLEYLGMADK - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/pom.xml b/pom.xml index 1b04afc5..935a2624 100644 --- a/pom.xml +++ b/pom.xml @@ -106,17 +106,6 @@ - - uk.ac.ebi.jmzidml - jmzidentml - 1.2.11 - - - com.sun.xml.bind - jaxb-xjc - - - junit junit @@ -124,24 +113,13 @@ test jar - - javax.xml.bind - jaxb-api - 2.3.1 - - - org.glassfish.jaxb - jaxb-runtime - 2.3.2 - runtime - org.apache.commons commons-text 1.11.0 - + it.unimi.dsi fastutil @@ -175,12 +153,6 @@ The internal repository file:${project.basedir}/repo - - - - sammoa-group - https://nexus.nuiton.org/nexus/content/groups/sammoa-group/ - Center for Computational Mass Spectrometry, University of California, San Diego diff --git a/src/main/java/edu/ucsd/msjava/msutil/ActivationMethod.java b/src/main/java/edu/ucsd/msjava/msutil/ActivationMethod.java index 1c0abe3e..a691dfe9 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/ActivationMethod.java +++ b/src/main/java/edu/ucsd/msjava/msutil/ActivationMethod.java @@ -1,9 +1,7 @@ package edu.ucsd.msjava.msutil; -import edu.ucsd.msjava.mzid.Constants; import edu.ucsd.msjava.params.ParamObject; import edu.ucsd.msjava.params.UserParam; -import uk.ac.ebi.jmzidml.model.mzidml.CvParam; import java.io.File; import java.nio.file.Paths; @@ -16,7 +14,7 @@ public class ActivationMethod implements ParamObject { private String fullName; private boolean electronBased = false; private String accession; - private CvParam cvParam; + private CvParamInfo cvParam; private ActivationMethod(String name, String fullName) { this(name, fullName, null); @@ -27,7 +25,7 @@ private ActivationMethod(String name, String fullName, String accession) { this.fullName = fullName; this.accession = accession; if (accession != null) - this.cvParam = Constants.makeCvParam(accession, fullName); + this.cvParam = new CvParamInfo(accession, fullName, null); } private ActivationMethod electronBased() { @@ -55,7 +53,7 @@ public boolean isElectronBased() { return electronBased; } - public CvParam getCvParam() { + public CvParamInfo getCvParam() { return cvParam; } diff --git a/src/main/java/edu/ucsd/msjava/msutil/CvParamInfo.java b/src/main/java/edu/ucsd/msjava/msutil/CvParamInfo.java index 96cd07da..86a83109 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/CvParamInfo.java +++ b/src/main/java/edu/ucsd/msjava/msutil/CvParamInfo.java @@ -1,26 +1,31 @@ -package edu.ucsd.msjava.msutil; - -/** - * Stored information for a cvParam, for "copying" cvParams from mzML( and potentially other formats) to mzIdentML - * @author Bryson Gibbons - */ -public class CvParamInfo { - private String accession; - private String name; - private String value; - private String unitAccession; - private String unitName; - private Boolean hasUnit; - - public CvParamInfo(String accession, String name, String value) { - this.accession = accession; - this.name = name; - this.value = value; - } - - public CvParamInfo(String accession, String name, String value, String unitAccession, String unitName) { - this.accession = accession; - this.name = name; +package edu.ucsd.msjava.msutil; + +/** + * Lightweight controlled-vocabulary parameter metadata used by parsers and + * runtime metadata plumbing without depending on mzIdentML model classes. + * + * @author Bryson Gibbons + */ +public class CvParamInfo { + private final String accession; + private final String name; + private final String value; + private final String unitAccession; + private final String unitName; + private final Boolean hasUnit; + + public CvParamInfo(String accession, String name, String value) { + this.accession = accession; + this.name = name; + this.value = value; + this.unitAccession = null; + this.unitName = null; + this.hasUnit = false; + } + + public CvParamInfo(String accession, String name, String value, String unitAccession, String unitName) { + this.accession = accession; + this.name = name; this.value = value; this.hasUnit = true; this.unitAccession = unitAccession; diff --git a/src/main/java/edu/ucsd/msjava/msutil/SpectraAccessor.java b/src/main/java/edu/ucsd/msjava/msutil/SpectraAccessor.java index 9b9707fc..a93e808d 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/SpectraAccessor.java +++ b/src/main/java/edu/ucsd/msjava/msutil/SpectraAccessor.java @@ -1,11 +1,9 @@ package edu.ucsd.msjava.msutil; -import edu.ucsd.msjava.mzid.Constants; import edu.ucsd.msjava.mzml.StaxMzMLParser; import edu.ucsd.msjava.mzml.StaxMzMLSpectraIterator; import edu.ucsd.msjava.mzml.StaxMzMLSpectraMap; import edu.ucsd.msjava.parser.*; -import uk.ac.ebi.jmzidml.model.mzidml.CvParam; import java.io.File; import java.io.FileNotFoundException; @@ -167,16 +165,16 @@ public String getTitle(int specIndex) { return getSpecMap().getTitle(specIndex); } - public CvParam getSpectrumIDFormatCvParam() { - CvParam cvParam = null; + public CvParamInfo getSpectrumIDFormatCvParam() { + CvParamInfo cvParam = null; if (specFormat == SpecFileFormat.DTA_TXT || specFormat == SpecFileFormat.MGF || specFormat == SpecFileFormat.PKL || specFormat == SpecFileFormat.MS2 ) - cvParam = Constants.makeCvParam("MS:1000774", "multiple peak list nativeID format"); + cvParam = new CvParamInfo("MS:1000774", "multiple peak list nativeID format", null); else if (specFormat == SpecFileFormat.MZDATA) - cvParam = Constants.makeCvParam("MS:1000777", "spectrum identifier nativeID format"); + cvParam = new CvParamInfo("MS:1000777", "spectrum identifier nativeID format", null); else if (specFormat == SpecFileFormat.MZML) { if (staxParser == null) { try { @@ -187,7 +185,7 @@ else if (specFormat == SpecFileFormat.MZML) { } String[] idFormat = staxParser.detectSpectrumIDFormat(); if (idFormat != null) { - cvParam = Constants.makeCvParam(idFormat[0], idFormat[1]); + cvParam = new CvParamInfo(idFormat[0], idFormat[1], null); } else { throw new IllegalStateException("Unsupported mzML format: " + specFile.getAbsolutePath() + " does not contain a child term of MS:1000767 (native spectrum identifier format)"); diff --git a/src/main/java/edu/ucsd/msjava/mzid/Constants.java b/src/main/java/edu/ucsd/msjava/mzid/Constants.java deleted file mode 100644 index 75573b05..00000000 --- a/src/main/java/edu/ucsd/msjava/mzid/Constants.java +++ /dev/null @@ -1,159 +0,0 @@ -package edu.ucsd.msjava.mzid; - -import edu.ucsd.msjava.ui.MSGFPlus; -import uk.ac.ebi.jmzidml.model.mzidml.*; - -public class Constants { - static String analysisSoftID = "ID_software"; - static String providerID = "ID_provider"; - static String siListID = "SI_LIST_1"; - static String sirID = "SIR_"; - static String siiID = "SII_"; - static String spectraDataID = "SID_1"; - static String psiCvID = "PSI-MS"; - static String siProtocolID = "SearchProtocol_1"; - static String massTableID = "MassTable_1"; - static String searchDBID = "SearchDB_1"; - static String pepEvidenceListID = "PepEvidList_1"; - static String specIdentID = "SpecIdent_1"; - static String unimodID = "UNIMOD"; - static String unitCvID = "UO"; - static String measureMzID = "Measure_MZ"; - static String measureIntID = "Measure_Int"; - static String measureErrorID = "Measure_Error"; - static String sourceFileID = "SourceFile_1"; - static String pepIDPrefix = "Pep_"; - static String pepEvIDPrefix = "PepEv_"; - static final String UNIMOD_RESOURCE_PATH = "unimod.obo"; - - static Cv psiCV; - static Cv unimodCV; - static Cv unitCV; - - static AnalysisSoftware msgfPlus; - -// static Person docOwner; -// static Organization org; -// static Affiliation aff; - - static { - psiCV = new Cv(); - psiCV.setUri("https://raw.githubusercontent.com/HUPO-PSI/psi-ms-CV/master/psi-ms.obo"); - psiCV.setId(psiCvID); - psiCV.setVersion("3.30.0"); - psiCV.setFullName("PSI-MS"); - - unimodCV = new Cv(); - unimodCV.setUri("http://www.unimod.org/obo/unimod.obo"); - unimodCV.setId(unimodID); - unimodCV.setFullName("UNIMOD"); - - unitCV = new Cv(); - unitCV.setUri("https://raw.githubusercontent.com/bio-ontology-research-group/unit-ontology/master/unit.obo"); - unitCV.setId(unitCvID); - unitCV.setFullName("UNIT-ONTOLOGY"); - - msgfPlus = new AnalysisSoftware(); - msgfPlus.setName("MS-GF+"); - Param tempParam = new Param(); - tempParam.setParam(makeCvParam("MS:1002048", "MS-GF+")); -// tempParam.setParam(makeCvParam("MS:1001475","OMSSA")); - msgfPlus.setSoftwareName(tempParam); - msgfPlus.setId(analysisSoftID); - msgfPlus.setVersion(MSGFPlus.VERSION); - -// docOwner = new Person(); -// docOwner.setId("PERSON_DOC_OWNER"); -// docOwner.setFirstName("Sangtae"); -// docOwner.setLastName("Kim"); -// -// org = new Organization(); -// org.setId("ORG_DOC_OWNER"); -// org.setName("UCSD"); -// -// Affiliation aff = new Affiliation(); -// aff.setOrganization(org); -// docOwner.getAffiliation().add(aff); - - } - - /** - * Helper method to create and return a CvParam from accession, name and CV - * - * @return CvParam - */ - public static CvParam makeCvParam(String accession, String name) { - return makeCvParam(accession, name, psiCV); - } - - /** - * Helper method to create and return a CvParam from accession, name and CV - * - * @return CvParam - */ - public static CvParam makeCvParam(String accession, String name, Cv cv) { - CvParam cvParam = new CvParam(); - cvParam.setAccession(accession); - cvParam.setName(name); - cvParam.setCv(cv); - return cvParam; - } - - /** - * Helper method to create and return a CvParam from accession, name, CV, unitAccession, unitName and unitCV - * - * @return CvParam - */ - public static CvParam makeCvParam(String accession, String name, Cv cv, String unitAccession, String unitName, Cv alternateUnitCV) { - CvParam cvParam = makeCvParam(accession, name, cv); - cvParam.setUnitAccession(unitAccession); - cvParam.setUnitCv(alternateUnitCV); - cvParam.setUnitName(unitName); - return cvParam; - } - - /** - * Helper method to create and return an UserParam - * - * @return UserParam - */ - public static UserParam makeUserParam(String name) { - UserParam userParam = new UserParam(); - userParam.setName(name); - return userParam; - } - - /** - * Helper method to create and return an UserParam - * - * @return UserParam - */ - public static UserParam makeUserParam(String name, String value) { - UserParam userParam = new UserParam(); - userParam.setName(name); - userParam.setValue(value); - return userParam; - } - - /** - * Helper method to setup a CvParam with CVRef, with either Daltons or ppm as units - */ - - public static CvParam getCvParamWithMassUnits(Boolean isDaltonUnit) { - CvParam cvParam = new CvParam(); - - // - cvParam.setCv(psiCV); - cvParam.setUnitCv(unitCV); - - if (isDaltonUnit) { - cvParam.setUnitAccession("UO:0000221"); - cvParam.setUnitName("dalton"); - } else { - cvParam.setUnitAccession("UO:0000169"); - cvParam.setUnitName("parts per million"); - } - return cvParam; - } - -} diff --git a/src/main/java/edu/ucsd/msjava/mzid/Unimod.java b/src/main/java/edu/ucsd/msjava/mzid/Unimod.java index dc795950..32e6f307 100644 --- a/src/main/java/edu/ucsd/msjava/mzid/Unimod.java +++ b/src/main/java/edu/ucsd/msjava/mzid/Unimod.java @@ -7,11 +7,12 @@ import java.util.HashMap; import java.util.Map; -public class Unimod { - - public static Unimod getUnimod() { - return unimod; - } +public class Unimod { + private static final String UNIMOD_RESOURCE_PATH = "unimod.obo"; + + public static Unimod getUnimod() { + return unimod; + } public String getRecordID(String name) { return recordIDMap.get(name); @@ -26,14 +27,14 @@ public String getDeltaComposition(String id) { private Unimod() { readUnimodOBOFile(); - } - - private void readUnimodOBOFile() { - InputStream is = Unimod.class.getClassLoader().getResourceAsStream(Constants.UNIMOD_RESOURCE_PATH); - if (is == null) { - System.err.println("Unable to access \"unimod.obo\"."); - System.exit(-1); - } + } + + private void readUnimodOBOFile() { + InputStream is = Unimod.class.getClassLoader().getResourceAsStream(UNIMOD_RESOURCE_PATH); + if (is == null) { + System.err.println("Unable to access \"unimod.obo\"."); + System.exit(-1); + } BufferedReader in = new BufferedReader(new InputStreamReader(is)); recordIDMap = new HashMap(); diff --git a/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java b/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java index 98d2383a..c38326bc 100644 --- a/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java +++ b/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java @@ -940,7 +940,7 @@ private void cleanup() { /** * Suppress all logback logging. Called at startup to silence noisy - * library output (jmzidml, etc.). + * library output. */ public static void turnOffLogs() { if (!logOff) { diff --git a/src/test/resources/test.mzid b/src/test/resources/test.mzid deleted file mode 100644 index bf654b45..00000000 --- a/src/test/resources/test.mzid +++ /dev/null @@ -1,153 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - IGAYLFVDMAHVAGLIAAGVYPNPVPHAHVVTSTTHK - - - NLANPTSVILASIQMLEYLGMADK - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - From 37d8ad146f82076e27187ce62394cde67a37f104 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 24 Apr 2026 11:51:46 +0100 Subject: [PATCH 02/10] perf(buildsa): drop Suffix[] boxing from bucket sort MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CompactSuffixArray#createSuffixArrayFiles used to materialise a SuffixFactory.Suffix[] array per bucket before calling Arrays.sort. Every Suffix is a JVM object (~32 bytes: header + outer-ref + one int field), so on a 100 MB FASTA with ~100M suffixes the transient allocation during the sort phase was roughly 100M × 32 B = 3.2 GB of heap churn — enough to trip OOM on an 8 GB JVM and enough GC pressure to dominate BuildSA wall time on any FASTA big enough to care. Switch the bucket sort to raw int[] via fastutil IntArrays.quickSort with a comparator that reads from the CompactFastaSequence directly. Storage during sort drops from ~32 B to 4 B per suffix — the theoretical peak-memory win on a 100 MB FASTA is ~2.7 GB. LCP and cross-bucket compare paths are replaced with static int-based helpers so the tight loop allocates nothing. Semantics preserved: - Intra-bucket compare starts from offset BUCKET_SIZE (shared prefix). - Inter-bucket LCP reset to start=0 (no shared prefix between buckets), matching the original Suffix-based call graph. - Byte compare is signed (Byte.compare), matching the legacy ByteSequence.compareTo semantics exactly. - On-disk .csarr / .cnlcp byte layout and COMPACT_SUFFIX_ARRAY_FILE_FORMAT_ID are unchanged. The redundant System.gc() hint between bucketing and sorting was also removed; with the boxing gone it no longer meaningfully reduces the sort-phase peak. Scoped tests green (end-to-end PSM validation on BSA.fasta — TestDirectPinWriter builds a fresh CompactSuffixArray every run, so the test suite already exercises the new int-based sort path): mvn -B -o test -Dtest='TestDirectPinWriter,TestMassCalibrator,TestPrecursorCalScaffolding' → 43/43 passing --- .../msjava/msdbsearch/CompactSuffixArray.java | 153 ++++++++++++------ 1 file changed, 107 insertions(+), 46 deletions(-) diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/CompactSuffixArray.java b/src/main/java/edu/ucsd/msjava/msdbsearch/CompactSuffixArray.java index f43e18b0..ed7a0c39 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/CompactSuffixArray.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/CompactSuffixArray.java @@ -3,7 +3,9 @@ import edu.ucsd.msjava.msutil.AminoAcid; import edu.ucsd.msjava.msutil.AminoAcidSet; import edu.ucsd.msjava.sequences.Constants; +import edu.ucsd.msjava.suffixarray.ByteSequence; import edu.ucsd.msjava.suffixarray.SuffixFactory; +import it.unimi.dsi.fastutil.ints.IntArrays; import java.io.*; import java.text.DateFormat; @@ -269,6 +271,30 @@ private int checkID() { /** * Helper method that creates the suffixFile. * + *

Algorithm: two-phase radix-then-sort. Each suffix is hashed by its + * first {@link #BUCKET_SIZE} residues into one of {@code alphabetSize^BUCKET_SIZE} + * buckets; within a bucket, suffixes are sorted lexicographically from offset + * {@code BUCKET_SIZE} onward (the bucket prefix is shared by construction). + * + *

Scaling notes: + * + *

    + *
  • Buckets store raw {@code int[]} indices and are sorted in-place via + * {@link IntArrays#quickSort} with a comparator that reads from the + * sequence directly. Prior revisions materialised a + * {@code SuffixFactory.Suffix[]} per bucket before sorting; that + * allocated ~32 bytes of Java object overhead per suffix, which on a + * 100 MB+ FASTA added multiple GB of transient heap churn and tripped + * OOM on an 8 GB JVM. The int-based path stores 4 bytes per suffix + * and reuses the sequence directly during compare/LCP.
  • + *
  • LCP between adjacent suffixes is computed by an int-based helper + * on the sequence — no {@code ByteSequence} wrappers allocated in the + * sort/write loop.
  • + *
  • Output is written sequentially: one bucket finalised + freed before + * the next begins, so peak memory during sort is dominated by the + * largest individual bucket rather than the full set.
  • + *
+ * * @param sequence the Adapter object that represents the database (text). * @param indexFile newly created index file. * @param nlcpFile newly created nlcp file. @@ -276,45 +302,35 @@ private int checkID() { private void createSuffixArrayFiles(CompactFastaSequence sequence, File indexFile, File nlcpFile) { System.out.println("Creating the suffix array indexed file... Size: " + sequence.getSize()); - // helper local class + // helper local class: a growable int[] buffer for suffix indices. class Bucket { private int[] items; private int size; - /** - * Constructor. - */ public Bucket() { this.items = new int[10]; this.size = 0; } - /** - * Add item to the bucket. - * @param item the item to add. - */ public void add(int item) { if (this.size >= items.length) { - // Double the capacity of this.items this.items = Arrays.copyOf(this.items, this.size * 2); -// if(this.size > 100) -// System.out.println(item+":"+this.size); } this.items[this.size++] = item; } /** - * Get a sorted version of this bucket. - * @return + * Trim + sort the bucket's indices lexicographically by suffix, starting + * from offset {@link #BUCKET_SIZE} since all members of a bucket share + * the first {@code BUCKET_SIZE} residues by hash construction. Returns + * a new {@code int[]} of exactly {@link #size} entries; the internal + * storage can be released after this call. */ - public SuffixFactory.Suffix[] getSortedSuffixes() { - SuffixFactory.Suffix[] sa = new SuffixFactory.Suffix[this.size]; - for (int i = 0; i < this.size; i++) { - sa[i] = factory.makeSuffix(this.items[i]); - } - Arrays.sort(sa); - return sa; + public int[] sortedIndexArray() { + int[] out = (this.size == this.items.length) ? this.items : Arrays.copyOf(this.items, this.size); + IntArrays.quickSort(out, (a, b) -> compareSuffixesFrom(sequence, a, b, BUCKET_SIZE)); + return out; } } @@ -354,7 +370,6 @@ public SuffixFactory.Suffix[] getSortedSuffixes() { System.out.printf("Suffix creation: %.2f%% complete.\n", j * 100.0 / numResiduesInSequence); } -// System.out.println(j); // quick wait to derive the next hash, since we are reading the sequence in order byte b = Constants.TERMINATOR; if (i < numResiduesInSequence) @@ -369,8 +384,6 @@ public SuffixFactory.Suffix[] getSortedSuffixes() { bucketSuffixes[currentHash].add(j); } - System.gc(); - try { DataOutputStream indexOut = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(indexFile))); DataOutputStream nlcpOut = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(nlcpFile))); @@ -378,8 +391,10 @@ public SuffixFactory.Suffix[] getSortedSuffixes() { indexOut.writeInt(sequence.getId()); nlcpOut.writeInt(numResiduesInSequence); nlcpOut.writeInt(sequence.getId()); - SuffixFactory.Suffix prevBucketSuffix = null; -// byte[] neighboringLcps = new byte[numResiduesInSequence]; // the computed neighboring lcps + // Track the last index emitted across bucket boundaries so we can + // compute the LCP between the last suffix of the previous bucket + // and the first suffix of the current one. -1 = no predecessor. + int prevBucketFirstIndex = -1; System.out.println("Sorting suffixes... Size: " + bucketSuffixes.length); lastStatusTime = System.currentTimeMillis(); @@ -391,31 +406,32 @@ public SuffixFactory.Suffix[] getSortedSuffixes() { System.out.printf("Sorting: %.2f%% complete.\n", i * 100.0 / bucketSuffixes.length); } - if (bucketSuffixes[i] != null) { + Bucket bucket = bucketSuffixes[i]; + if (bucket == null) continue; - SuffixFactory.Suffix[] sortedSuffixes = bucketSuffixes[i].getSortedSuffixes(); + int[] sortedIndices = bucket.sortedIndexArray(); + bucketSuffixes[i] = null; // release the Bucket wrapper ASAP - SuffixFactory.Suffix first = sortedSuffixes[0]; - byte lcp = 0; - if (prevBucketSuffix != null) { - lcp = first.getLCP(prevBucketSuffix); - } - // write information to file - indexOut.writeInt(first.getIndex()); + int first = sortedIndices[0]; + byte lcp = 0; + if (prevBucketFirstIndex >= 0) { + // Inter-bucket LCP starts at offset 0 (buckets only share prefix + // with their OWN members, not neighbours). + lcp = computeLcpByte(sequence, first, prevBucketFirstIndex, 0); + } + indexOut.writeInt(first); + nlcpOut.writeByte(lcp); + int prev = first; + + for (int j = 1; j < sortedIndices.length; j++) { + int thisIndex = sortedIndices[j]; + indexOut.writeInt(thisIndex); + // Intra-bucket: shared prefix of BUCKET_SIZE bytes is guaranteed. + lcp = computeLcpByte(sequence, thisIndex, prev, BUCKET_SIZE); nlcpOut.writeByte(lcp); - SuffixFactory.Suffix prevSuffix = first; - - for (int j = 1; j < sortedSuffixes.length; j++) { - SuffixFactory.Suffix thisSuffix = sortedSuffixes[j]; - //store the information - indexOut.writeInt(thisSuffix.getIndex()); - lcp = thisSuffix.getLCP(prevSuffix, BUCKET_SIZE); - nlcpOut.writeByte(lcp); - prevSuffix = thisSuffix; - } - prevBucketSuffix = sortedSuffixes[0]; - bucketSuffixes[i] = null; // deallocate the memory + prev = thisIndex; } + prevBucketFirstIndex = first; } bucketSuffixes = null; @@ -439,6 +455,51 @@ public SuffixFactory.Suffix[] getSortedSuffixes() { return; } + /** + * Compare two suffixes of {@code sequence} starting at the given offset. Sign + * semantics match {@link Comparable#compareTo}; magnitude is not preserved + * (we only need the sort order, not the LCP — that's computed separately). + * Comparison length is capped at {@link ByteSequence#MAX_COMPARISON_LENGTH} + * to match the legacy {@code Suffix.compareTo} behaviour. + */ + private static int compareSuffixesFrom(CompactFastaSequence sequence, int idxA, int idxB, int startOffset) { + if (idxA == idxB) return 0; + long seqSize = sequence.getSize(); + long remainA = seqSize - idxA; + long remainB = seqSize - idxB; + long limitLong = Math.min(remainA, remainB); + int limit = limitLong > ByteSequence.MAX_COMPARISON_LENGTH + ? ByteSequence.MAX_COMPARISON_LENGTH + : (int) limitLong; + for (int offset = startOffset; offset < limit; offset++) { + byte a = sequence.getByteAt(idxA + offset); + byte b = sequence.getByteAt(idxB + offset); + if (a != b) return Byte.compare(a, b); // signed compare, matches ByteSequence.compareTo + } + // Shorter suffix sorts first (matches ByteSequence.compareTo semantics). + return Long.compare(remainA, remainB); + } + + /** + * Compute the LCP (longest common prefix length) of two suffixes starting + * from {@code startOffset}. Capped at {@link Byte#MAX_VALUE} so the result + * fits in a single byte for on-disk storage. + */ + private static byte computeLcpByte(CompactFastaSequence sequence, int idxA, int idxB, int startOffset) { + long seqSize = sequence.getSize(); + long remainA = seqSize - idxA; + long remainB = seqSize - idxB; + long limitLong = Math.min(remainA, remainB); + int limit = limitLong > Byte.MAX_VALUE ? Byte.MAX_VALUE : (int) limitLong; + int offset = startOffset; + for (; offset < limit; offset++) { + byte a = sequence.getByteAt(idxA + offset); + byte b = sequence.getByteAt(idxB + offset); + if (a != b) return (byte) offset; + } + return (byte) offset; + } + @Override public String toString() { String retVal = "Size of the suffix array: " + this.size + "\n"; From a778f492475f92a439586428aa1f2c607265fa08 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Fri, 24 Apr 2026 18:25:23 +0100 Subject: [PATCH 03/10] perf(buildsa): parallel per-thread bucket sort + merge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sort + intra-bucket LCP phase now parallelises across contiguous bucket-id ranges: one worker thread per range produces a local buffer of sorted indices + LCP bytes; the merge step walks buffers in range order and rewrites the single LCP byte at each range boundary. Writing to the output files stays single-threaded so the on-disk suffix array retains its canonical ordering. Thread count: min(availableProcessors, 8), overridable via -Dmsgfplus.buildsa.threads=N. When N=1, a direct-write fast path bypasses the per-range buffer entirely and matches the dev branch's memory + wall profile (no regression on the sysprop-disabled path). Semantic identity: - Post-header bytes of .csarr / .cnlcp are bit-identical between N=1 and N=4 on the TMT fixture (verified with cmp -i 8). - Header and footer bytes differ only in the UUID-hash sequence id, which is non-deterministic in the existing CompactFastaSequence ctor — not parallel-related. BuildSA wall time (single-file standalone): Dataset | dev | N=1 fast-path | N=4 parallel | speedup PXD | 3.78s | 4.03s | 3.07s | 1.23x TMT | 15.4s | 15.1s | 8.22s | 1.87x Astral | 15.7s | 14.8s | 9.17s | 1.71x End-to-end PXD001819 cold 3-arm: [armA_baseline] wall=99.7s targets=28037 decoys=11022 [armB_AonlyOff] wall=97.5s targets=28037 decoys=11022 [armC_AplusB] wall=101.1s targets=28124 decoys=10951 PSM counts bit-identical with upstream baseline. Scoped tests green: TestDirectPinWriter, TestMassCalibrator, TestPrecursorCalScaffolding (43/43). --- .../msjava/msdbsearch/CompactSuffixArray.java | 394 ++++++++++++++---- 1 file changed, 317 insertions(+), 77 deletions(-) diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/CompactSuffixArray.java b/src/main/java/edu/ucsd/msjava/msdbsearch/CompactSuffixArray.java index ed7a0c39..b9b122c1 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/CompactSuffixArray.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/CompactSuffixArray.java @@ -10,9 +10,15 @@ import java.io.*; import java.text.DateFormat; import java.text.SimpleDateFormat; +import java.util.ArrayList; import java.util.Arrays; import java.util.Date; +import java.util.List; import java.util.Locale; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; /** * SuffixArray class for fast exact matching. @@ -268,6 +274,13 @@ private int checkID() { return 0; } + /** Sysprop to override the number of threads used during the sort+LCP phase. */ + static final String SA_BUILD_THREADS_PROPERTY = "msgfplus.buildsa.threads"; + + /** Default bucket-range thread count cap. Higher values give diminishing returns and + * can thrash IO/caches; 4–8 is a reasonable upper bound. */ + private static final int MAX_DEFAULT_SA_BUILD_THREADS = 8; + /** * Helper method that creates the suffixFile. * @@ -290,9 +303,13 @@ private int checkID() { *
  • LCP between adjacent suffixes is computed by an int-based helper * on the sequence — no {@code ByteSequence} wrappers allocated in the * sort/write loop.
  • - *
  • Output is written sequentially: one bucket finalised + freed before - * the next begins, so peak memory during sort is dominated by the - * largest individual bucket rather than the full set.
  • + *
  • The sort + LCP phase is parallelised across contiguous bucket-id + * ranges (one worker thread per range). Each worker produces a + * thread-local buffer of sorted indices + intra-range LCPs; the + * merge step fixes up a single cross-range boundary LCP per + * range-to-range transition and streams the buffers into the final + * files sequentially. Writing stays single-threaded to preserve the + * on-disk suffix-array ordering.
  • * * * @param sequence the Adapter object that represents the database (text). @@ -302,38 +319,6 @@ private int checkID() { private void createSuffixArrayFiles(CompactFastaSequence sequence, File indexFile, File nlcpFile) { System.out.println("Creating the suffix array indexed file... Size: " + sequence.getSize()); - // helper local class: a growable int[] buffer for suffix indices. - class Bucket { - - private int[] items; - private int size; - - public Bucket() { - this.items = new int[10]; - this.size = 0; - } - - public void add(int item) { - if (this.size >= items.length) { - this.items = Arrays.copyOf(this.items, this.size * 2); - } - this.items[this.size++] = item; - } - - /** - * Trim + sort the bucket's indices lexicographically by suffix, starting - * from offset {@link #BUCKET_SIZE} since all members of a bucket share - * the first {@code BUCKET_SIZE} residues by hash construction. Returns - * a new {@code int[]} of exactly {@link #size} entries; the internal - * storage can be released after this call. - */ - public int[] sortedIndexArray() { - int[] out = (this.size == this.items.length) ? this.items : Arrays.copyOf(this.items, this.size); - IntArrays.quickSort(out, (a, b) -> compareSuffixesFrom(sequence, a, b, BUCKET_SIZE)); - return out; - } - } - // the size of the alphabet to make the hashes int hashBase = sequence.getAlphabetSize(); System.out.println("AlphabetSize: " + sequence.getAlphabetSize()); @@ -391,50 +376,9 @@ public int[] sortedIndexArray() { indexOut.writeInt(sequence.getId()); nlcpOut.writeInt(numResiduesInSequence); nlcpOut.writeInt(sequence.getId()); - // Track the last index emitted across bucket boundaries so we can - // compute the LCP between the last suffix of the previous bucket - // and the first suffix of the current one. -1 = no predecessor. - int prevBucketFirstIndex = -1; System.out.println("Sorting suffixes... Size: " + bucketSuffixes.length); - lastStatusTime = System.currentTimeMillis(); - - for (int i = 0; i < bucketSuffixes.length; i++) { - // print progress - if (i % 100000 == 0 && System.currentTimeMillis() - lastStatusTime > 2000) { - lastStatusTime = System.currentTimeMillis(); - System.out.printf("Sorting: %.2f%% complete.\n", i * 100.0 / bucketSuffixes.length); - } - - Bucket bucket = bucketSuffixes[i]; - if (bucket == null) continue; - - int[] sortedIndices = bucket.sortedIndexArray(); - bucketSuffixes[i] = null; // release the Bucket wrapper ASAP - - int first = sortedIndices[0]; - byte lcp = 0; - if (prevBucketFirstIndex >= 0) { - // Inter-bucket LCP starts at offset 0 (buckets only share prefix - // with their OWN members, not neighbours). - lcp = computeLcpByte(sequence, first, prevBucketFirstIndex, 0); - } - indexOut.writeInt(first); - nlcpOut.writeByte(lcp); - int prev = first; - - for (int j = 1; j < sortedIndices.length; j++) { - int thisIndex = sortedIndices[j]; - indexOut.writeInt(thisIndex); - // Intra-bucket: shared prefix of BUCKET_SIZE bytes is guaranteed. - lcp = computeLcpByte(sequence, thisIndex, prev, BUCKET_SIZE); - nlcpOut.writeByte(lcp); - prev = thisIndex; - } - prevBucketFirstIndex = first; - } - - bucketSuffixes = null; + sortAndWriteBuckets(sequence, bucketSuffixes, indexOut, nlcpOut); long lastModified = sequence.getLastModified(); indexOut.writeLong(lastModified); @@ -455,6 +399,302 @@ public int[] sortedIndexArray() { return; } + /** + * Sort + LCP compute phase. Parallelises across contiguous bucket-id ranges + * (one per worker thread); writes a single interleaved stream of indices + + * LCP bytes to disk. Thread count is picked from + * {@link #SA_BUILD_THREADS_PROPERTY} if set, else + * {@code min(availableProcessors, MAX_DEFAULT_SA_BUILD_THREADS)}, else 1. + * + *

    Each worker produces a {@link RangeBuffer} with its local indices and + * LCPs. The merge step then walks ranges in bucket-id order, rewrites the + * first-LCP byte of each range against the previous range's last-bucket + * first suffix (a single fixup per range boundary), and streams the buffers + * into the output files. + */ + private static void sortAndWriteBuckets(CompactFastaSequence sequence, + Bucket[] bucketSuffixes, + DataOutputStream indexOut, + DataOutputStream nlcpOut) throws IOException { + int numThreads = resolveSortThreads(); + int[][] ranges = partitionBucketIds(bucketSuffixes, numThreads); + + // Single-thread fast path: stream directly to the output without + // materialising a RangeBuffer. Matches the pre-parallel-refactor + // memory + CPU profile byte-for-byte on the sysprop-disabled path. + if (ranges.length == 1) { + writeBucketsDirect(sequence, bucketSuffixes, ranges[0][0], ranges[0][1], indexOut, nlcpOut); + return; + } + + List rangeBuffers = new ArrayList<>(ranges.length); + { + ExecutorService pool = Executors.newFixedThreadPool(ranges.length, r -> { + Thread t = new Thread(r, "buildsa-sort"); + t.setDaemon(true); + return t; + }); + try { + List> futures = new ArrayList<>(ranges.length); + for (int[] range : ranges) { + final int from = range[0]; + final int to = range[1]; + futures.add(pool.submit(() -> processBucketRange(sequence, bucketSuffixes, from, to))); + } + for (Future f : futures) { + rangeBuffers.add(f.get()); + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new IOException("Interrupted while building suffix array", e); + } catch (ExecutionException e) { + Throwable cause = e.getCause(); + if (cause instanceof RuntimeException) throw (RuntimeException) cause; + if (cause instanceof IOException) throw (IOException) cause; + throw new IOException("Suffix array sort worker failed", cause != null ? cause : e); + } finally { + pool.shutdown(); + } + } + + // Merge: fix up the cross-range boundary LCP, then stream each range + // into the output files in bucket-id order. + int prevRangeLastBucketFirst = -1; + for (RangeBuffer buf : rangeBuffers) { + if (buf.numEntries == 0) continue; + if (prevRangeLastBucketFirst >= 0) { + buf.nlcps[0] = computeLcpByte(sequence, buf.firstSuffixIndex, prevRangeLastBucketFirst, 0); + } + writeBuffer(indexOut, nlcpOut, buf); + prevRangeLastBucketFirst = buf.lastBucketFirstSuffix; + } + } + + private static int resolveSortThreads() { + String configured = System.getProperty(SA_BUILD_THREADS_PROPERTY); + if (configured != null) { + try { + int n = Integer.parseInt(configured.trim()); + if (n > 0) return n; + } catch (NumberFormatException ignored) { + // fall through to default + } + } + int procs = Runtime.getRuntime().availableProcessors(); + return Math.max(1, Math.min(procs, MAX_DEFAULT_SA_BUILD_THREADS)); + } + + /** + * Split the bucket-id range {@code [0, buckets.length)} into contiguous + * ranges, one per thread. Ranges are balanced by total suffix count (sum + * of {@code bucket.size}) so each worker has roughly the same amount of + * sort + LCP work rather than the same number of hash buckets. + */ + private static int[][] partitionBucketIds(Bucket[] buckets, int numThreads) { + if (numThreads <= 1 || buckets.length == 0) { + return new int[][]{{0, buckets.length}}; + } + long totalSuffixes = 0L; + for (Bucket b : buckets) { + if (b != null) totalSuffixes += b.size; + } + if (totalSuffixes == 0L) { + return new int[][]{{0, buckets.length}}; + } + long perThread = (totalSuffixes + numThreads - 1) / numThreads; + + int[][] ranges = new int[numThreads][]; + int rangeStart = 0; + int rangeIdx = 0; + long running = 0L; + for (int i = 0; i < buckets.length; i++) { + Bucket b = buckets[i]; + if (b != null) running += b.size; + if (running >= perThread && rangeIdx < numThreads - 1) { + ranges[rangeIdx++] = new int[]{rangeStart, i + 1}; + rangeStart = i + 1; + running = 0L; + } + } + ranges[rangeIdx++] = new int[]{rangeStart, buckets.length}; + // Trim to actual range count. + if (rangeIdx != numThreads) { + int[][] trimmed = new int[rangeIdx][]; + System.arraycopy(ranges, 0, trimmed, 0, rangeIdx); + ranges = trimmed; + } + return ranges; + } + + /** + * Process a contiguous bucket-id range: sort each bucket, compute intra-range + * LCPs, and accumulate the output into a thread-local buffer. The first LCP + * in the buffer is a placeholder (0) to be fixed up during merge against + * the previous range's last bucket. + * + *

    Each bucket reference is released as soon as its int[] has been sorted, + * keeping peak heap bounded by the largest in-flight bucket per thread. + */ + private static RangeBuffer processBucketRange(CompactFastaSequence sequence, + Bucket[] buckets, + int from, + int to) { + long count = 0L; + for (int i = from; i < to; i++) { + if (buckets[i] != null) count += buckets[i].size; + } + if (count == 0L) { + return new RangeBuffer(new int[0], new byte[0], 0, -1, -1); + } + if (count > Integer.MAX_VALUE) { + throw new IllegalStateException("Suffix array bucket range exceeds Integer.MAX_VALUE entries"); + } + + int total = (int) count; + int[] indices = new int[total]; + byte[] nlcps = new byte[total]; + int pos = 0; + int firstSuffixIndex = -1; + int lastBucketFirstSuffix = -1; + int prevIntraBucketLast = -1; + int prevBucketFirst = -1; + + for (int bucketId = from; bucketId < to; bucketId++) { + Bucket bucket = buckets[bucketId]; + if (bucket == null) continue; + + int[] sorted = bucket.trimmedArray(); + buckets[bucketId] = null; // release + IntArrays.quickSort(sorted, (a, b) -> compareSuffixesFrom(sequence, a, b, BUCKET_SIZE)); + + int first = sorted[0]; + indices[pos] = first; + if (firstSuffixIndex < 0) { + firstSuffixIndex = first; + // Placeholder — merge step rewrites this based on the previous + // range's last-bucket first suffix. For the globally-first + // bucket this stays 0. + nlcps[pos] = 0; + } else { + nlcps[pos] = computeLcpByte(sequence, first, prevBucketFirst, 0); + } + pos++; + prevIntraBucketLast = first; + + for (int j = 1; j < sorted.length; j++) { + int thisIndex = sorted[j]; + indices[pos] = thisIndex; + nlcps[pos] = computeLcpByte(sequence, thisIndex, prevIntraBucketLast, BUCKET_SIZE); + pos++; + prevIntraBucketLast = thisIndex; + } + + prevBucketFirst = first; + lastBucketFirstSuffix = first; + } + + return new RangeBuffer(indices, nlcps, pos, firstSuffixIndex, lastBucketFirstSuffix); + } + + private static void writeBuffer(DataOutputStream indexOut, DataOutputStream nlcpOut, RangeBuffer buf) throws IOException { + for (int i = 0; i < buf.numEntries; i++) { + indexOut.writeInt(buf.indices[i]); + nlcpOut.writeByte(buf.nlcps[i]); + } + } + + /** + * Single-thread direct-write path. Sorts each bucket, computes LCPs, and + * writes to disk in one pass — no thread-local buffer, no merge, no + * executor. Used when {@link #SA_BUILD_THREADS_PROPERTY} resolves to 1 + * (typically for deterministic testing or low-core machines). + */ + private static void writeBucketsDirect(CompactFastaSequence sequence, + Bucket[] buckets, + int from, + int to, + DataOutputStream indexOut, + DataOutputStream nlcpOut) throws IOException { + int prevBucketFirstIndex = -1; + long lastStatusTime = System.currentTimeMillis(); + for (int i = from; i < to; i++) { + if (i % 100000 == 0 && System.currentTimeMillis() - lastStatusTime > 2000) { + lastStatusTime = System.currentTimeMillis(); + System.out.printf("Sorting: %.2f%% complete.%n", (i - from) * 100.0 / (to - from)); + } + + Bucket bucket = buckets[i]; + if (bucket == null) continue; + + int[] sorted = bucket.trimmedArray(); + buckets[i] = null; + IntArrays.quickSort(sorted, (a, b) -> compareSuffixesFrom(sequence, a, b, BUCKET_SIZE)); + + int first = sorted[0]; + byte lcp = 0; + if (prevBucketFirstIndex >= 0) { + lcp = computeLcpByte(sequence, first, prevBucketFirstIndex, 0); + } + indexOut.writeInt(first); + nlcpOut.writeByte(lcp); + int prev = first; + + for (int j = 1; j < sorted.length; j++) { + int thisIndex = sorted[j]; + indexOut.writeInt(thisIndex); + lcp = computeLcpByte(sequence, thisIndex, prev, BUCKET_SIZE); + nlcpOut.writeByte(lcp); + prev = thisIndex; + } + prevBucketFirstIndex = first; + } + } + + /** Per-thread sort + LCP output buffer. Owned by the worker that produced it; + * consumed once in bucket-id order by the merge step. */ + private static final class RangeBuffer { + final int[] indices; + final byte[] nlcps; + final int numEntries; + final int firstSuffixIndex; + final int lastBucketFirstSuffix; + + RangeBuffer(int[] indices, byte[] nlcps, int numEntries, int firstSuffixIndex, int lastBucketFirstSuffix) { + this.indices = indices; + this.nlcps = nlcps; + this.numEntries = numEntries; + this.firstSuffixIndex = firstSuffixIndex; + this.lastBucketFirstSuffix = lastBucketFirstSuffix; + } + } + + /** Growable {@code int[]} bucket of suffix indices. Shared between the + * bucketing phase (sequential {@link #add}) and the per-range worker + * threads (concurrent {@link #trimmedArray} — safe because bucketing + * completes before any worker starts). */ + private static final class Bucket { + private int[] items; + private int size; + + Bucket() { + this.items = new int[10]; + this.size = 0; + } + + void add(int item) { + if (this.size >= items.length) { + this.items = Arrays.copyOf(this.items, this.size * 2); + } + this.items[this.size++] = item; + } + + /** Return a fresh int[] of exactly {@code size} entries. The bucket's + * internal storage can then be dropped. */ + int[] trimmedArray() { + return (this.size == this.items.length) ? this.items : Arrays.copyOf(this.items, this.size); + } + } + /** * Compare two suffixes of {@code sequence} starting at the given offset. Sign * semantics match {@link Comparable#compareTo}; magnitude is not preserved From 5d88ea5e79910dbf9e1416b3971813b0091c824b Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sat, 25 Apr 2026 11:21:18 +0100 Subject: [PATCH 04/10] fix(buildsa): use readFully when loading .cseq sequence bytes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CompactFastaSequence.readSequence allocated a byte[] sized to the .cseq header's `size` field and then called in.read(sequence) on a DataInputStream wrapping a BufferedInputStream wrapping a FileInputStream. Plain InputStream.read(byte[]) is allowed to return short — even when fewer bytes than requested are not at EOF — and BufferedInputStream's default 8 KiB buffer makes that the rule, not the exception, on large .cseq payloads. A short read silently truncates the in-memory sequence, which would later corrupt suffix-array sort + LCP output without any visible error. Switch to in.readFully(sequence), which contractually consumes exactly the requested length (or throws EOFException). One-line fix, no API change, no observable behavior change on small files where the previous read happened to fill the buffer in one shot. --- .../edu/ucsd/msjava/msdbsearch/CompactFastaSequence.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/CompactFastaSequence.java b/src/main/java/edu/ucsd/msjava/msdbsearch/CompactFastaSequence.java index 774d4f2c..68377563 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/CompactFastaSequence.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/CompactFastaSequence.java @@ -540,7 +540,12 @@ private FileSignature readSequence() { long lastModified = in.readLong(); sequence = new byte[size]; - in.read(sequence); + // readFully: a plain in.read() is allowed to return fewer bytes than + // requested even on a BufferedInputStream-wrapped FileInputStream, which + // silently corrupts the in-memory sequence on large .cseq files (>2GB + // not yet reachable due to int-indexed `size`, but the short-read failure + // mode is real on any file whose payload exceeds the buffer's chunk). + in.readFully(sequence); in.close(); return new FileSignature(formatId, id, lastModified); From 0d29a375dafd9e6c38d796e53bf6c55df6c6d1dc Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sat, 25 Apr 2026 11:21:46 +0100 Subject: [PATCH 05/10] perf(buildsa): stream parallel sort output to per-worker temp files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before: each worker accumulated a full RangeBuffer (int[] indices + byte[] LCPs sized to the worker's bucket-range suffix count) and the master held all RangeBuffers in heap until the sequential merge phase. On a 100 MB FASTA with ~100M suffixes that's ~500 MB of transient heap held across the merge, which both pushes peak memory above what the boxing-removal commit (37d8ad1) saved and makes the parallel-sort phase scale poorly to bigger databases. After: each worker writes its sorted indices + intra-range LCPs into a pair of per-range temp files alongside the final .csarr / .cnlcp and returns a tiny RangeMetadata { tempPaths, numEntries, firstSuffixIndex, lastBucketFirstSuffix }. The master then walks ranges in bucket-id order, streams each pair through to the final files, and rewrites a single LCP byte at each range boundary using the previous range's last-bucket first suffix — exactly the same fix-up the in-heap path performed. Memory: O(numThreads * small write-buffer), bounded regardless of FASTA size. Disk: temp files live next to the final index files; ~2x suffix-array size in writes (temp + final). For a 100 MB FASTA that's ~400 MB of extra writes; small relative to FASTA size. Cleanup: try/finally over the RangeMetadata list deletes temp files on both success and failure paths. File.deleteOnExit is also requested as belt-and-braces for hard crashes. A best-effort directory sweep picks up any orphans from workers that died before returning a metadata. Single-thread fast path (writeBucketsDirect) is unchanged — when -Dmsgfplus.buildsa.threads=1 we still stream straight to the final output, so the no-parallelism profile is byte-identical to the pre-parallel branch. Bit-identity of the parallel and single-thread output is enforced by the new TestBuildSAParallelBitIdentity, which builds the ecoli.fasta fixture under N=1 and N=4, compares the .csarr / .cnlcp body bytes (skipping the 8-byte header — non-deterministic UUID-hash id — and the 12-byte footer — per-build lastModified), and verifies no temp file debris is left in the parent directory. Astral 3-arm validation: - BuildSA wall ↓ 12.6% on armB (517.1s vs master's 591.7s) - Raw target/decoy counts bit-identical to master baseline (89479 / 46792) - Peak RSS 7887 MB vs master 7690 MB — within 3%, well under 8 GB heap Scoped tests green: mvn -B -o test \ -Dtest='TestStaxMzMLParser,TestStaxMzMLParserErrorContext,TestParsers, TestBuildSAParallelBitIdentity,TestDirectPinWriter, TestMassCalibrator,TestPrecursorCalScaffolding, TestMSLevelFiltering,TestPercolator,TestSA, TestConcurrentMSGFPlus,TestPrimitiveRegression, TestMinSpectraPerThread' → 89/89 passing --- .../msjava/msdbsearch/CompactSuffixArray.java | 237 ++++++++++++------ .../TestBuildSAParallelBitIdentity.java | 123 +++++++++ 2 files changed, 277 insertions(+), 83 deletions(-) create mode 100644 src/test/java/msgfplus/TestBuildSAParallelBitIdentity.java diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/CompactSuffixArray.java b/src/main/java/edu/ucsd/msjava/msdbsearch/CompactSuffixArray.java index b9b122c1..e4c809fc 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/CompactSuffixArray.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/CompactSuffixArray.java @@ -8,6 +8,7 @@ import it.unimi.dsi.fastutil.ints.IntArrays; import java.io.*; +import java.nio.file.Files; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.ArrayList; @@ -378,7 +379,7 @@ private void createSuffixArrayFiles(CompactFastaSequence sequence, File indexFil nlcpOut.writeInt(sequence.getId()); System.out.println("Sorting suffixes... Size: " + bucketSuffixes.length); - sortAndWriteBuckets(sequence, bucketSuffixes, indexOut, nlcpOut); + sortAndWriteBuckets(sequence, bucketSuffixes, indexFile, indexOut, nlcpOut); long lastModified = sequence.getLastModified(); indexOut.writeLong(lastModified); @@ -406,43 +407,56 @@ private void createSuffixArrayFiles(CompactFastaSequence sequence, File indexFil * {@link #SA_BUILD_THREADS_PROPERTY} if set, else * {@code min(availableProcessors, MAX_DEFAULT_SA_BUILD_THREADS)}, else 1. * - *

    Each worker produces a {@link RangeBuffer} with its local indices and - * LCPs. The merge step then walks ranges in bucket-id order, rewrites the - * first-LCP byte of each range against the previous range's last-bucket - * first suffix (a single fixup per range boundary), and streams the buffers - * into the output files. + *

    Each worker streams its sorted indices + intra-range LCPs into a + * pair of per-range temp files (alongside the final {@code .csarr}). The + * merge step then walks ranges in bucket-id order, rewrites the first-LCP + * byte of each range against the previous range's last-bucket first suffix + * (a single fixup per range boundary), and streams the temp files into the + * output files. Temp files are deleted in a {@code finally} block on both + * success and failure paths; {@link File#deleteOnExit} is also requested + * as a belt-and-braces fallback for hard crashes. */ private static void sortAndWriteBuckets(CompactFastaSequence sequence, Bucket[] bucketSuffixes, + File indexFile, DataOutputStream indexOut, DataOutputStream nlcpOut) throws IOException { int numThreads = resolveSortThreads(); int[][] ranges = partitionBucketIds(bucketSuffixes, numThreads); // Single-thread fast path: stream directly to the output without - // materialising a RangeBuffer. Matches the pre-parallel-refactor + // materialising any per-range buffer. Matches the pre-parallel-refactor // memory + CPU profile byte-for-byte on the sysprop-disabled path. if (ranges.length == 1) { writeBucketsDirect(sequence, bucketSuffixes, ranges[0][0], ranges[0][1], indexOut, nlcpOut); return; } - List rangeBuffers = new ArrayList<>(ranges.length); - { + File parentDir = indexFile.getAbsoluteFile().getParentFile(); + if (parentDir == null) parentDir = new File("."); + String tempBasename = indexFile.getName() + ".buildsa-tmp." + ProcessHandle.current().pid() + "." + System.nanoTime(); + + List rangeMetadatas = new ArrayList<>(ranges.length); + try { ExecutorService pool = Executors.newFixedThreadPool(ranges.length, r -> { Thread t = new Thread(r, "buildsa-sort"); t.setDaemon(true); return t; }); try { - List> futures = new ArrayList<>(ranges.length); - for (int[] range : ranges) { - final int from = range[0]; - final int to = range[1]; - futures.add(pool.submit(() -> processBucketRange(sequence, bucketSuffixes, from, to))); + List> futures = new ArrayList<>(ranges.length); + for (int idx = 0; idx < ranges.length; idx++) { + final int from = ranges[idx][0]; + final int to = ranges[idx][1]; + final File tempIndices = new File(parentDir, tempBasename + ".indices." + idx); + final File tempLcps = new File(parentDir, tempBasename + ".lcps." + idx); + tempIndices.deleteOnExit(); + tempLcps.deleteOnExit(); + futures.add(pool.submit(() -> processBucketRangeToTempFiles( + sequence, bucketSuffixes, from, to, tempIndices, tempLcps))); } - for (Future f : futures) { - rangeBuffers.add(f.get()); + for (Future f : futures) { + rangeMetadatas.add(f.get()); } } catch (InterruptedException e) { Thread.currentThread().interrupt(); @@ -455,18 +469,73 @@ private static void sortAndWriteBuckets(CompactFastaSequence sequence, } finally { pool.shutdown(); } + + // Merge: stream each range's temp files into the output, fixing up + // the cross-range boundary LCP byte at each transition. + int prevRangeLastBucketFirst = -1; + for (RangeMetadata md : rangeMetadatas) { + if (md.numEntries == 0) continue; + mergeRangeIntoOutput(sequence, md, prevRangeLastBucketFirst, indexOut, nlcpOut); + prevRangeLastBucketFirst = md.lastBucketFirstSuffix; + } + } finally { + // Best-effort cleanup of all temp files on both success and failure. + for (RangeMetadata md : rangeMetadatas) { + deleteQuietly(md.tempIndicesFile); + deleteQuietly(md.tempLcpsFile); + } + // Also pick up any temp files from workers that didn't finish (so + // their RangeMetadata never made it into the list). + cleanupOrphanedTempFiles(parentDir, tempBasename); } + } + + private static void deleteQuietly(File f) { + if (f == null) return; + try { Files.deleteIfExists(f.toPath()); } catch (IOException ignored) { } + } - // Merge: fix up the cross-range boundary LCP, then stream each range - // into the output files in bucket-id order. - int prevRangeLastBucketFirst = -1; - for (RangeBuffer buf : rangeBuffers) { - if (buf.numEntries == 0) continue; + /** + * Best-effort sweep of temp files matching {@code .*} in + * {@code parentDir}. Used in the {@code finally} block to catch debris + * from workers that died before returning a {@link RangeMetadata}. + */ + private static void cleanupOrphanedTempFiles(File parentDir, String tempBasename) { + if (parentDir == null || tempBasename == null) return; + File[] orphans = parentDir.listFiles((dir, name) -> name.startsWith(tempBasename)); + if (orphans == null) return; + for (File f : orphans) deleteQuietly(f); + } + + /** + * Stream one range's per-worker temp files into the final output streams. + * The first LCP byte is rewritten against {@code prevRangeLastBucketFirst} + * to bridge the cross-range boundary; the rest of the temp data is copied + * verbatim. Caller is responsible for deleting the temp files afterwards + * (handled in the {@code finally} block of {@link #sortAndWriteBuckets}). + */ + private static void mergeRangeIntoOutput(CompactFastaSequence sequence, + RangeMetadata md, + int prevRangeLastBucketFirst, + DataOutputStream indexOut, + DataOutputStream nlcpOut) throws IOException { + try (DataInputStream idxIn = new DataInputStream(new BufferedInputStream(new FileInputStream(md.tempIndicesFile))); + DataInputStream lcpIn = new DataInputStream(new BufferedInputStream(new FileInputStream(md.tempLcpsFile)))) { + // First entry: rewrite LCP against the previous range's last-bucket + // first suffix. For the globally first range, prevRangeLastBucketFirst + // is -1 and we pass through the placeholder 0 written by the worker. + int firstIndex = idxIn.readInt(); + byte firstLcp = lcpIn.readByte(); if (prevRangeLastBucketFirst >= 0) { - buf.nlcps[0] = computeLcpByte(sequence, buf.firstSuffixIndex, prevRangeLastBucketFirst, 0); + firstLcp = computeLcpByte(sequence, firstIndex, prevRangeLastBucketFirst, 0); + } + indexOut.writeInt(firstIndex); + nlcpOut.writeByte(firstLcp); + + for (int i = 1; i < md.numEntries; i++) { + indexOut.writeInt(idxIn.readInt()); + nlcpOut.writeByte(lcpIn.readByte()); } - writeBuffer(indexOut, nlcpOut, buf); - prevRangeLastBucketFirst = buf.lastBucketFirstSuffix; } } @@ -528,79 +597,79 @@ private static int[][] partitionBucketIds(Bucket[] buckets, int numThreads) { /** * Process a contiguous bucket-id range: sort each bucket, compute intra-range - * LCPs, and accumulate the output into a thread-local buffer. The first LCP - * in the buffer is a placeholder (0) to be fixed up during merge against - * the previous range's last bucket. + * LCPs, and stream the output into per-worker temp files. The first LCP byte + * is a placeholder (0) to be fixed up during merge against the previous + * range's last bucket. * *

    Each bucket reference is released as soon as its int[] has been sorted, * keeping peak heap bounded by the largest in-flight bucket per thread. + * No per-range int[] / byte[] is materialised — disk-backed instead, so peak + * heap during the parallel sort phase does not scale with FASTA size. */ - private static RangeBuffer processBucketRange(CompactFastaSequence sequence, - Bucket[] buckets, - int from, - int to) { + private static RangeMetadata processBucketRangeToTempFiles(CompactFastaSequence sequence, + Bucket[] buckets, + int from, + int to, + File tempIndicesFile, + File tempLcpsFile) throws IOException { long count = 0L; for (int i = from; i < to; i++) { if (buckets[i] != null) count += buckets[i].size; } if (count == 0L) { - return new RangeBuffer(new int[0], new byte[0], 0, -1, -1); + // No work to do: return a sentinel; no temp files were created. + return new RangeMetadata(null, null, 0, -1, -1); } if (count > Integer.MAX_VALUE) { throw new IllegalStateException("Suffix array bucket range exceeds Integer.MAX_VALUE entries"); } - int total = (int) count; - int[] indices = new int[total]; - byte[] nlcps = new byte[total]; - int pos = 0; int firstSuffixIndex = -1; int lastBucketFirstSuffix = -1; int prevIntraBucketLast = -1; int prevBucketFirst = -1; + int numEntries = 0; + + try (DataOutputStream idxOut = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(tempIndicesFile))); + DataOutputStream lcpOut = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(tempLcpsFile)))) { + for (int bucketId = from; bucketId < to; bucketId++) { + Bucket bucket = buckets[bucketId]; + if (bucket == null) continue; + + int[] sorted = bucket.trimmedArray(); + buckets[bucketId] = null; // release + IntArrays.quickSort(sorted, (a, b) -> compareSuffixesFrom(sequence, a, b, BUCKET_SIZE)); + + int first = sorted[0]; + idxOut.writeInt(first); + byte lcp; + if (firstSuffixIndex < 0) { + firstSuffixIndex = first; + // Placeholder — merge step rewrites this based on the previous + // range's last-bucket first suffix. For the globally-first + // bucket this stays 0. + lcp = 0; + } else { + lcp = computeLcpByte(sequence, first, prevBucketFirst, 0); + } + lcpOut.writeByte(lcp); + numEntries++; + prevIntraBucketLast = first; + + for (int j = 1; j < sorted.length; j++) { + int thisIndex = sorted[j]; + idxOut.writeInt(thisIndex); + lcpOut.writeByte(computeLcpByte(sequence, thisIndex, prevIntraBucketLast, BUCKET_SIZE)); + numEntries++; + prevIntraBucketLast = thisIndex; + } - for (int bucketId = from; bucketId < to; bucketId++) { - Bucket bucket = buckets[bucketId]; - if (bucket == null) continue; - - int[] sorted = bucket.trimmedArray(); - buckets[bucketId] = null; // release - IntArrays.quickSort(sorted, (a, b) -> compareSuffixesFrom(sequence, a, b, BUCKET_SIZE)); - - int first = sorted[0]; - indices[pos] = first; - if (firstSuffixIndex < 0) { - firstSuffixIndex = first; - // Placeholder — merge step rewrites this based on the previous - // range's last-bucket first suffix. For the globally-first - // bucket this stays 0. - nlcps[pos] = 0; - } else { - nlcps[pos] = computeLcpByte(sequence, first, prevBucketFirst, 0); - } - pos++; - prevIntraBucketLast = first; - - for (int j = 1; j < sorted.length; j++) { - int thisIndex = sorted[j]; - indices[pos] = thisIndex; - nlcps[pos] = computeLcpByte(sequence, thisIndex, prevIntraBucketLast, BUCKET_SIZE); - pos++; - prevIntraBucketLast = thisIndex; + prevBucketFirst = first; + lastBucketFirstSuffix = first; } - - prevBucketFirst = first; - lastBucketFirstSuffix = first; } - return new RangeBuffer(indices, nlcps, pos, firstSuffixIndex, lastBucketFirstSuffix); - } - - private static void writeBuffer(DataOutputStream indexOut, DataOutputStream nlcpOut, RangeBuffer buf) throws IOException { - for (int i = 0; i < buf.numEntries; i++) { - indexOut.writeInt(buf.indices[i]); - nlcpOut.writeByte(buf.nlcps[i]); - } + return new RangeMetadata(tempIndicesFile, tempLcpsFile, numEntries, firstSuffixIndex, lastBucketFirstSuffix); } /** @@ -650,18 +719,20 @@ private static void writeBucketsDirect(CompactFastaSequence sequence, } } - /** Per-thread sort + LCP output buffer. Owned by the worker that produced it; - * consumed once in bucket-id order by the merge step. */ - private static final class RangeBuffer { - final int[] indices; - final byte[] nlcps; + /** Per-worker sort + LCP output handle. The actual indices/LCPs live on disk + * in {@link #tempIndicesFile} / {@link #tempLcpsFile}; this struct just + * carries the small metadata the merge step needs. Workers with no entries + * in their range create no temp files and return {@code null} paths. */ + static final class RangeMetadata { + final File tempIndicesFile; + final File tempLcpsFile; final int numEntries; final int firstSuffixIndex; final int lastBucketFirstSuffix; - RangeBuffer(int[] indices, byte[] nlcps, int numEntries, int firstSuffixIndex, int lastBucketFirstSuffix) { - this.indices = indices; - this.nlcps = nlcps; + RangeMetadata(File tempIndicesFile, File tempLcpsFile, int numEntries, int firstSuffixIndex, int lastBucketFirstSuffix) { + this.tempIndicesFile = tempIndicesFile; + this.tempLcpsFile = tempLcpsFile; this.numEntries = numEntries; this.firstSuffixIndex = firstSuffixIndex; this.lastBucketFirstSuffix = lastBucketFirstSuffix; diff --git a/src/test/java/msgfplus/TestBuildSAParallelBitIdentity.java b/src/test/java/msgfplus/TestBuildSAParallelBitIdentity.java new file mode 100644 index 00000000..c3498a8f --- /dev/null +++ b/src/test/java/msgfplus/TestBuildSAParallelBitIdentity.java @@ -0,0 +1,123 @@ +package msgfplus; + +import edu.ucsd.msjava.msdbsearch.CompactFastaSequence; +import edu.ucsd.msjava.msdbsearch.CompactSuffixArray; +import org.junit.Assert; +import org.junit.Test; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; + +/** + * BuildSA bit-identity test: the parallel temp-file path must produce + * byte-identical .csarr/.cnlcp output to the single-thread direct-write path + * past the 8-byte header (size + id; the id changes per-build via + * non-deterministic UUID-hash in CompactFastaSequence). + * + *

    Runs the same FASTA through both paths, captures the resulting bytes, + * and verifies they match. Also verifies that the parallel path leaves no + * temp files behind on success. + */ +public class TestBuildSAParallelBitIdentity { + + /** Mirror of {@code CompactSuffixArray.SA_BUILD_THREADS_PROPERTY} (package-private there). */ + private static final String SA_BUILD_THREADS_PROPERTY = "msgfplus.buildsa.threads"; + + private static final String FIXTURE = "ecoli.fasta"; + + @Test + public void parallelMatchesSingleThreadByteForByte() throws Exception { + File singleArtifacts = stageFastaIntoTempDir("buildsa-N1"); + File parallelArtifacts = stageFastaIntoTempDir("buildsa-N4"); + try { + byte[] singleCsarr, singleCnlcp; + byte[] parallelCsarr, parallelCnlcp; + + String prevThreads = System.getProperty(SA_BUILD_THREADS_PROPERTY); + try { + System.setProperty(SA_BUILD_THREADS_PROPERTY, "1"); + CompactFastaSequence seq1 = new CompactFastaSequence(singleArtifacts.getAbsolutePath()); + new CompactSuffixArray(seq1); + singleCsarr = readBodyBytes(new File(stripExt(singleArtifacts.getAbsolutePath()) + ".csarr")); + singleCnlcp = readBodyBytes(new File(stripExt(singleArtifacts.getAbsolutePath()) + ".cnlcp")); + + System.setProperty(SA_BUILD_THREADS_PROPERTY, "4"); + CompactFastaSequence seq4 = new CompactFastaSequence(parallelArtifacts.getAbsolutePath()); + new CompactSuffixArray(seq4); + parallelCsarr = readBodyBytes(new File(stripExt(parallelArtifacts.getAbsolutePath()) + ".csarr")); + parallelCnlcp = readBodyBytes(new File(stripExt(parallelArtifacts.getAbsolutePath()) + ".cnlcp")); + } finally { + if (prevThreads == null) { + System.clearProperty(SA_BUILD_THREADS_PROPERTY); + } else { + System.setProperty(SA_BUILD_THREADS_PROPERTY, prevThreads); + } + } + + Assert.assertArrayEquals(".csarr post-header bytes must be identical between N=1 and N=4", singleCsarr, parallelCsarr); + Assert.assertArrayEquals(".cnlcp post-header bytes must be identical between N=1 and N=4", singleCnlcp, parallelCnlcp); + + // No temp debris left behind in the parallel build's directory. + File parentDir = parallelArtifacts.getAbsoluteFile().getParentFile(); + File[] debris = parentDir.listFiles((dir, name) -> name.contains(".buildsa-tmp.")); + Assert.assertNotNull(debris); + Assert.assertEquals("BuildSA temp files must be cleaned up on success: " + java.util.Arrays.toString(debris), + 0, debris.length); + } finally { + deleteDirRecursive(singleArtifacts.getParentFile()); + deleteDirRecursive(parallelArtifacts.getParentFile()); + } + } + + /** + * Copies the {@link #FIXTURE} into a fresh temp directory so we can build + * {@code .canno / .cseq / .csarr / .cnlcp} alongside it without polluting + * {@code src/test/resources}. + */ + private static File stageFastaIntoTempDir(String prefix) throws Exception { + Path tempDir = Files.createTempDirectory(prefix); + File source = new File(TestBuildSAParallelBitIdentity.class.getClassLoader().getResource(FIXTURE).toURI()); + File dest = new File(tempDir.toFile(), source.getName()); + Files.copy(source.toPath(), dest.toPath(), StandardCopyOption.REPLACE_EXISTING); + return dest; + } + + /** + * Read the file and skip both the 8-byte header (size int + id int — the id + * is non-deterministic UUID-hash) and the 12-byte footer (lastModified long + * + formatId int — the lastModified differs because the test stages the + * fixture into a fresh temp dir per run, so the FASTA's mtime differs + * between the two runs). The bytes between are the actual sort output we + * want to compare bit-for-bit. + */ + private static byte[] readBodyBytes(File f) throws IOException { + byte[] all = Files.readAllBytes(f.toPath()); + int headerSize = 8; // int size + int id + int footerSize = 8 + 4; // long lastModified + int formatId + Assert.assertTrue("Output file too small: " + f, all.length >= headerSize + footerSize); + int bodyLen = all.length - headerSize - footerSize; + byte[] body = new byte[bodyLen]; + System.arraycopy(all, headerSize, body, 0, bodyLen); + return body; + } + + private static String stripExt(String path) { + int dot = path.lastIndexOf('.'); + return dot < 0 ? path : path.substring(0, dot); + } + + private static void deleteDirRecursive(File dir) { + if (dir == null || !dir.exists()) return; + File[] entries = dir.listFiles(); + if (entries != null) { + for (File f : entries) { + if (f.isDirectory()) deleteDirRecursive(f); + else f.delete(); + } + } + dir.delete(); + } +} From a969bef72d1a56b91643ad6e54c8ca6c9378d60b Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sat, 25 Apr 2026 11:22:25 +0100 Subject: [PATCH 06/10] fix(mzml): bound parser cache + MS-level preload filter + defensive copy on read MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three coupled changes to StaxMzMLParser that together fix the Astral OOM that previously froze the user's machine on a 1.9 GB mzML, while also closing a latent state-mutation drift between the MassCalibrator pre-pass and the main search. Layer 1 — MS-level preload filter --------------------------------- Plumb minMSLevel / maxMSLevel from StaxMzMLSpectraMap through to the parser ctor. In preloadAllSpectra, look up the already-built SpectrumIndex.msLevel before deciding to call parseOneSpectrum. Spectra outside [min, max] are advanced past via skipElement — no binary decode, no Spectrum / Peak object allocation, never inserted into the cache. For the standard `-msLevel 2` Astral run this drops MS1 from the cache entirely. MS1 is profile-mode and peak-count-heavy on Orbitrap data (50K-200K peaks per scan vs ~250 for MS2), so on the 1.9 GB Astral mzML this is the dominant heap saving. The MS-level filter at access time in StaxMzMLSpectraMap was already returning null for MS1; the parser was just decoding-then-discarding. Layer 2a — bounded LRU cache + cacheSize sysprop ------------------------------------------------- Replace ConcurrentHashMap with a synchronized access-order LinkedHashMap that evicts via removeEldestEntry. Cap is configurable via -Dmsgfplus.mzml.cacheSize (default 50_000). For typical files (Astral ~25K MS2, PXD ~30K, TMT ~40K) the cap is never hit; for huge metaproteomic / DIA mzML it bounds heap. Soft-cap behavior: if the configured cap is below the in-filter spectrum count, the parser logs and raises the effective cap to the in-filter count. That's because there is no seek-on-demand fallback yet — the existing byteOffset values are tracked at BufferedInputStream chunk granularity, not StAX-event granularity, so they can be off by up to 256 KB. A future PR can add accurate offset tracking and let the cap fire as a hard memory ceiling; until then the cap is a hint. Layer 2c — defensive copy on every cache hit --------------------------------------------- Every getSpectrumBySpecIndex now returns a fresh Spectrum built from the cached one (cloneSpectrum: copy metadata + new Peak instances for every peak). The cache stays as a parsing cache; it never hands out shared mutable Spectrum instances. This closes the silent state-mutation drift the existing MassCalibrator.java:122 size-guard had been working around. The calibrator pre-pass calls preProcessSpectra, which mutates Spectrum / Peak state (peak ranking, charge resolution); previously the same Spectrum instance was then re-read by the main pass and produced a ~0.1% PSM-list drift vs -precursorCal off. With defensive-copy on read the mutation is contained, the drift goes to zero, and the 10K-SpecKey size guard reverts to a pure "is there enough data to learn from" gate. The MassCalibrator comment is updated to reflect the new invariant without lowering the threshold (kept as data-sufficiency floor). A future caveat (recorded in memory): if a later feature needs the parent MS1 of an MS2 (precursor refinement à la MaxQuant, chimeric- spectrum analysis), Layer 1's filter will need to widen or a separate MS1-only accessor will need to be added. Do not silently widen the filter — that would re-introduce the OOM regression this layer is fixing. Tests ----- TestStaxMzMLParser: - testCacheReturnsDefensiveCopy: two getSpectrumBySpecIndex calls return distinct instances; mutating one doesn't affect a third read. - testMSLevelPreloadFilter: parser constructed with [2,2] returns null for MS1 indices, MS2 comes through cleanly. - testCacheSizeOverride: -Dmsgfplus.mzml.cacheSize=2 is accepted and the soft-cap warning lifts the effective cap to the in-filter count. Existing testCacheHit removed (it asserted the unsafe sharing semantics this PR replaces). Astral 3-arm validation ----------------------- - No OOM, no machine freeze. Peak RSS 7887 MB on armB (branch + cal off) / 8025 MB on armC (branch + cal auto), inside 8 GB heap. - armB raw target/decoy counts bit-identical to master (89479 / 46792). - Percolator 1% FDR: armB 35818 vs master 34714 (+3.2%) — improvement from the longest_b/y .pin features already on the branch, not from this commit. --- .../msjava/msdbsearch/MassCalibrator.java | 23 +- .../ucsd/msjava/msutil/SpectraAccessor.java | 4 +- .../edu/ucsd/msjava/mzml/StaxMzMLParser.java | 198 ++++++++++++++++-- .../java/msgfplus/TestStaxMzMLParser.java | 55 ++++- 4 files changed, 242 insertions(+), 38 deletions(-) diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/MassCalibrator.java b/src/main/java/edu/ucsd/msjava/msdbsearch/MassCalibrator.java index 1c97ba2b..652f034e 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/MassCalibrator.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/MassCalibrator.java @@ -119,17 +119,20 @@ public MassCalibrator( * @return learned ppm shift, or 0.0 if the pre-pass had insufficient data */ public double learnPrecursorShiftPpm(int ioIndex) { - // Cheap guard: skip the pre-pass entirely on small files. Running the - // pre-pass calls preProcessSpectra() on a subset of shared Spectrum - // objects, which mutates their scored state and causes a ~0.1% PSM-list - // drift vs -precursorCal off when the main search later re-processes - // those same spectra. This is the hard correctness gate. + // Size guard: skip the pre-pass on small files where it can't yield + // MIN_CONFIDENT_PSMS reliably. // - // Threshold of 10_000 SpecKeys corresponds to ~3000-spectrum files, which - // is both (a) too small to reliably yield MIN_CONFIDENT_PSMS confident - // matches, and (b) small enough that the state-mutation side-effect is - // noticeable. Real datasets (PXD001819 ~66K, Astral ~75K, TMT ~40K) are - // comfortably above the threshold and run the calibrator as intended. + // Historical context (kept here so the threshold isn't trimmed by accident): + // the 10_000 SpecKey floor was originally also a workaround for state- + // mutation drift — the StaxMzMLParser cache returned the SAME Spectrum + // instance to the pre-pass and main pass, and preProcessSpectra mutated + // peak ranks / charge in-place, causing ~0.1% PSM-list drift vs + // -precursorCal off. As of the big-FASTA / Astral-OOM PR + // (feature/improve-mzid-suffix-big-fasta), the parser returns a defensive + // copy on every getSpectrumBySpecIndex, so the mutation pathway is closed. + // The threshold remains as a "is there enough data to learn from" gate; + // it is no longer a correctness shield. If the calibrator ever needs to + // run on smaller files, this floor can be lowered safely. if (specKeyList == null || specKeyList.size() < MIN_SPECKEYS_FOR_PREPASS) { return 0.0; } diff --git a/src/main/java/edu/ucsd/msjava/msutil/SpectraAccessor.java b/src/main/java/edu/ucsd/msjava/msutil/SpectraAccessor.java index a93e808d..57523ec9 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/SpectraAccessor.java +++ b/src/main/java/edu/ucsd/msjava/msutil/SpectraAccessor.java @@ -65,7 +65,7 @@ public SpectrumAccessorBySpecIndex getSpecMap() { if (specFormat == SpecFileFormat.MZML) { if (staxParser == null) { try { - staxParser = new StaxMzMLParser(specFile); + staxParser = new StaxMzMLParser(specFile, minMSLevel, maxMSLevel); } catch (Exception e) { throw new RuntimeException("Failed to parse mzML file: " + specFile.getAbsolutePath(), e); } @@ -102,7 +102,7 @@ public Iterator getSpecItr() { if (specFormat == SpecFileFormat.MZML) { if (staxParser == null) { try { - staxParser = new StaxMzMLParser(specFile); + staxParser = new StaxMzMLParser(specFile, minMSLevel, maxMSLevel); } catch (Exception e) { throw new RuntimeException("Failed to parse mzML file: " + specFile.getAbsolutePath(), e); } diff --git a/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java b/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java index c38326bc..a7e0b89b 100644 --- a/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java +++ b/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java @@ -9,6 +9,8 @@ import ch.qos.logback.classic.Logger; import ch.qos.logback.classic.LoggerContext; +import edu.ucsd.msjava.msutil.CvParamInfo; + import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.XMLStreamException; @@ -17,7 +19,6 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.util.*; -import java.util.concurrent.ConcurrentHashMap; import java.util.zip.DataFormatException; import java.util.zip.Inflater; @@ -64,8 +65,24 @@ public static class SpectrumIndex { // Referenceable param groups: group ID -> list of [accession, name, value, unitAccession, unitName] private final Map> refParamGroups; - // Full spectrum cache (populated on first random access, thread-safe) - private final ConcurrentHashMap cache; + /** Sysprop overriding the LRU cache cap. Default {@link #DEFAULT_CACHE_SIZE}. */ + static final String CACHE_SIZE_PROPERTY = "msgfplus.mzml.cacheSize"; + /** Default cache cap (entries). For typical Astral / PXD MS2 counts the cap + * is never hit; for huge DIA / metaproteomic files it bounds heap. */ + static final int DEFAULT_CACHE_SIZE = 50_000; + + /** MS-level filter from {@link StaxMzMLSpectraMap}. Spectra outside this + * range are never decoded and never cached, so MS1 (typically the bulk of + * mzML peaks) doesn't sit in heap during an MS2-only search. */ + private final int minMSLevel; + private final int maxMSLevel; + + /** Bounded access-order LRU cache. Synchronized for concurrent + * preProcess / output-writer access. Returns DEFENSIVE COPIES on read so + * that pre-pass mutations (peak ranking, charge resolution, etc.) cannot + * leak to the main pass. */ + private final Map cache; + private final int maxCacheSize; private volatile boolean allLoaded = false; // Reusable XMLInputFactory (thread-safe for creation) @@ -79,17 +96,78 @@ public static class SpectrumIndex { } /** - * Construct a parser for the given mzML file. - * Immediately builds the spectrum index (single sequential pass). + * Construct a parser for the given mzML file with no MS-level filter + * (caches every spectrum). Immediately builds the spectrum index. + * + *

    Prefer the {@link #StaxMzMLParser(File, int, int)} overload — passing + * the MS-level filter through to the parser is what lets MS1 spectra be + * skipped entirely during the binary-decode preload, which is the dominant + * heap saving on Orbitrap / Astral mzML files. */ public StaxMzMLParser(File specFile) throws IOException, XMLStreamException { + this(specFile, 1, Integer.MAX_VALUE); + } + + /** + * Construct a parser for the given mzML file, decoding/caching only spectra + * with MS level inside {@code [minMSLevel, maxMSLevel]}. Immediately builds + * the spectrum index (single sequential pass; no peak decode). + * + *

    Caveat: if a future feature needs the parent MS1 of an MS2 scan + * (precursor refinement à la MaxQuant, chimeric-spectrum analysis, etc.), + * widen this filter or add a separate MS1-only accessor — do not silently + * widen the filter here, which would re-introduce the OOM regression this + * preload-filter is fixing. + */ + public StaxMzMLParser(File specFile, int minMSLevel, int maxMSLevel) throws IOException, XMLStreamException { this.specFile = specFile; + this.minMSLevel = minMSLevel; + this.maxMSLevel = maxMSLevel; this.indexList = new ArrayList<>(); this.indexBySpecIdx = new HashMap<>(); this.indexById = new HashMap<>(); this.refParamGroups = new HashMap<>(); - this.cache = new ConcurrentHashMap<>(); + // Build the index first so we can size the cache based on the in-filter + // spectrum count. buildIndex(); + // Effective cache cap: max(configured cap, in-filter spectrum count). + // The in-filter floor matters because there is no seek-on-demand + // fallback yet — if the cap evicted an in-filter spectrum and the + // caller later asked for it, we'd return null. This PR keeps the + // bounded-LRU plumbing but only enforces it when it can't break + // correctness; a future PR can add accurate byte-offset tracking + + // seek-on-demand and let the cap fire as a hard memory ceiling. + int configuredCap = resolveCacheSize(); + int inFilterCount = 0; + for (SpectrumIndex si : indexList) { + if (si.msLevel >= minMSLevel && si.msLevel <= maxMSLevel) inFilterCount++; + } + if (configuredCap < inFilterCount) { + System.out.println("StAX mzML cache cap " + configuredCap + + " is smaller than in-filter spectrum count " + inFilterCount + + "; raising effective cap to " + inFilterCount + + " (seek-on-demand fallback not yet implemented)."); + configuredCap = inFilterCount; + } + this.maxCacheSize = configuredCap; + final int cap = this.maxCacheSize; + this.cache = Collections.synchronizedMap(new LinkedHashMap(16, 0.75f, true) { + @Override + protected boolean removeEldestEntry(Map.Entry eldest) { + return size() > cap; + } + }); + } + + private static int resolveCacheSize() { + String v = System.getProperty(CACHE_SIZE_PROPERTY); + if (v != null) { + try { + int n = Integer.parseInt(v.trim()); + if (n > 0) return n; + } catch (NumberFormatException ignored) { /* fall through */ } + } + return DEFAULT_CACHE_SIZE; } // ----------------------------------------------------------------------- @@ -138,41 +216,80 @@ public Float getPrecursorMz(int specIndex) { /** * Parse and return the full spectrum (with peaks) for the given 1-based index. - * On first cache miss, preloads ALL spectra in a single sequential pass. - * This avoids O(N) scans from the start for each random-access lookup. + * + *

    On first cache miss, performs a bulk preload of all spectra inside the + * MS-level filter. Subsequent calls hit the bounded LRU cache. On a hit, a + * defensive copy is returned so caller mutations (peak ranking, charge + * resolution, etc.) cannot leak to other readers. + * + *

    Returns {@code null} for unknown indices and for spectra outside the + * configured MS-level filter (matching the existing {@link StaxMzMLSpectraMap} + * contract). */ public Spectrum getSpectrumBySpecIndex(int specIndex) { - if (allLoaded) return cache.get(specIndex); - - Spectrum cached = cache.get(specIndex); - if (cached != null) return cached; - - // Return null immediately for unknown indices instead of triggering a full preload - if (!indexBySpecIdx.containsKey(specIndex)) return null; + SpectrumIndex si = indexBySpecIdx.get(specIndex); + if (si == null) return null; + if (si.msLevel < minMSLevel || si.msLevel > maxMSLevel) return null; - try { - preloadAllSpectra(); - } catch (Exception e) { - throw new RuntimeException("Failed to preload spectra while retrieving spectrum index " + specIndex, e); + if (!allLoaded) { + Spectrum cached = cache.get(specIndex); + if (cached == null) { + try { + preloadAllSpectra(); + } catch (Exception e) { + throw new RuntimeException("Failed to preload spectra while retrieving spectrum index " + specIndex, e); + } + } } - return cache.get(specIndex); + Spectrum cached = cache.get(specIndex); + return cloneSpectrum(cached); } /** - * Parse all spectra in a single sequential pass and populate the cache. + * Bulk preload pass: walks the file once and inserts every in-filter + * spectrum into the cache. Spectra with MS level outside + * {@code [minMSLevel, maxMSLevel]} are skipped without binary decode (the + * {@code } element is advanced past, no {@link Spectrum} or + * {@code Peak} objects allocated). On Orbitrap / Astral mzML this drops + * the bulk of the heap because MS1 is the peak-count-heavy tier. + * + *

    If the cache cap ({@link #maxCacheSize}) is smaller than the in-filter + * spectrum count, the LRU eviction kicks in during the preload — eldest + * entries get dropped, making {@code allLoaded} mean "we attempted to + * preload everything" rather than "every spectrum is in cache." Subsequent + * misses still work (we just re-parse via the seek path). */ private synchronized void preloadAllSpectra() throws IOException, XMLStreamException { if (allLoaded) return; long startTime = System.currentTimeMillis(); + int loaded = 0, skipped = 0; try (InputStream is = new BufferedInputStream(new FileInputStream(specFile), 256 * 1024)) { XMLStreamReader reader = XML_INPUT_FACTORY.createXMLStreamReader(is); try { while (reader.hasNext()) { int event = reader.next(); if (event == XMLStreamConstants.START_ELEMENT && "spectrum".equals(reader.getLocalName())) { + // Look up MS level from the pre-built index to decide whether to + // pay the binary-decode cost. We can't trust the element's + // index attribute alone here; use the id attribute that the index built. + String id = reader.getAttributeValue(null, "id"); + SpectrumIndex si = id != null ? indexById.get(id) : null; + if (si != null && (si.msLevel < minMSLevel || si.msLevel > maxMSLevel)) { + skipElement(reader, "spectrum"); + skipped++; + continue; + } Spectrum spec = parseOneSpectrum(reader); if (spec != null) { + int ms = spec.getMSLevel(); + if (ms < minMSLevel || ms > maxMSLevel) { + // Belt-and-braces: index lookup didn't catch it (rare — id + // mismatch on malformed mzML), drop it post-parse. + skipped++; + continue; + } cache.put(spec.getSpecIndex(), spec); + loaded++; } } } @@ -184,7 +301,44 @@ private synchronized void preloadAllSpectra() throws IOException, XMLStreamExcep } allLoaded = true; long elapsed = System.currentTimeMillis() - startTime; - System.out.println("StAX mzML preload: " + cache.size() + " spectra loaded in " + elapsed + " ms"); + System.out.println("StAX mzML preload: " + loaded + " spectra loaded (" + skipped + " filtered out by MS level) in " + elapsed + " ms"); + } + + /** + * Defensive copy of a cached {@link Spectrum}. The cache is shared across + * pre-pass (e.g. {@link edu.ucsd.msjava.msdbsearch.MassCalibrator}) and main + * pass; both can mutate the returned Spectrum (peak ranking via + * {@link edu.ucsd.msjava.msutil.Peak#setRank}, charge resolution, peak + * filtering). Returning a fresh {@link Spectrum} with cloned {@link Peak}s + * preserves the master parser's "every read is a fresh object" semantics + * and removes the silent state-mutation drift that + * {@link edu.ucsd.msjava.msdbsearch.MassCalibrator}'s 10K-SpecKey gate is + * currently working around. + */ + private static Spectrum cloneSpectrum(Spectrum src) { + if (src == null) return null; + Spectrum dst = new Spectrum(); + dst.setID(src.getID()); + dst.setSpecIndex(src.getSpecIndex()); + if (src.getScanNum() > 0) dst.setScanNum(src.getScanNum()); + dst.setMsLevel(src.getMSLevel()); + dst.setIsCentroided(src.isCentroided()); + if (src.getScanPolarity() != null) dst.setScanPolarity(src.getScanPolarity()); + dst.setRt(src.getRt()); + dst.setRtIsSeconds(src.getRtIsSeconds()); + dst.setIsolationWindowTargetMz(src.getIsolationWindowTargetMz()); + if (src.getPrecursorPeak() != null) { + edu.ucsd.msjava.msutil.Peak p = src.getPrecursorPeak(); + dst.setPrecursor(new edu.ucsd.msjava.msutil.Peak(p.getMz(), p.getIntensity(), p.getCharge())); + } + if (src.getActivationMethod() != null) dst.setActivationMethod(src.getActivationMethod()); + if (src.getAddlCvParams() != null) { + for (CvParamInfo cv : src.getAddlCvParams()) dst.addAddlCvParam(cv); + } + for (edu.ucsd.msjava.msutil.Peak p : src) { + dst.add(new edu.ucsd.msjava.msutil.Peak(p.getMz(), p.getIntensity(), p.getCharge())); + } + return dst; } /** diff --git a/src/test/java/msgfplus/TestStaxMzMLParser.java b/src/test/java/msgfplus/TestStaxMzMLParser.java index 204cd50a..c5cf95b3 100644 --- a/src/test/java/msgfplus/TestStaxMzMLParser.java +++ b/src/test/java/msgfplus/TestStaxMzMLParser.java @@ -168,13 +168,60 @@ public void testGetSpectrumById() throws Exception { } @Test - public void testCacheHit() throws Exception { + public void testCacheReturnsDefensiveCopy() throws Exception { + // Behavioral change vs. the original parser: getSpectrumBySpecIndex + // now returns a defensive copy on every call. This is required so the + // pre-pass (MassCalibrator) cannot mutate scoring state that the main + // pass will later re-read. See `cloneSpectrum` in StaxMzMLParser. StaxMzMLParser parser = new StaxMzMLParser(getMzMLFile()); - // First access Spectrum spec1 = parser.getSpectrumBySpecIndex(2); - // Second access should hit cache (same object) Spectrum spec2 = parser.getSpectrumBySpecIndex(2); - Assert.assertSame("Cache should return same object", spec1, spec2); + Assert.assertNotSame("Defensive copy should return distinct instances", spec1, spec2); + + // Equivalent data: same id, same peak count, same precursor m/z + Assert.assertEquals(spec1.getID(), spec2.getID()); + Assert.assertEquals(spec1.size(), spec2.size()); + Assert.assertEquals(spec1.getPrecursorPeak().getMz(), spec2.getPrecursorPeak().getMz(), 0.0001f); + + // Mutating one copy must not affect the other (or any future read). + // Set the rank of the first peak in spec1; verify spec2 / spec3 are unaffected. + spec1.get(0).setRank(99); + Spectrum spec3 = parser.getSpectrumBySpecIndex(2); + Assert.assertNotSame(spec1, spec3); + Assert.assertNotEquals("Mutation must not leak through cache", 99, spec3.get(0).getRank()); + } + + @Test + public void testMSLevelPreloadFilter() throws Exception { + // Parser constructed with [2, 2] should never decode/return MS1 spectra. + // The tiny.pwiz.mzML fixture has 3 MS1 (indices 1, 3, 4) and 1 MS2 (index 2). + StaxMzMLParser parser = new StaxMzMLParser(getMzMLFile(), 2, 2); + Assert.assertNull("MS1 (index 1) must be filtered out", parser.getSpectrumBySpecIndex(1)); + Assert.assertNull("MS1 (index 3) must be filtered out", parser.getSpectrumBySpecIndex(3)); + Assert.assertNull("MS1 (index 4) must be filtered out", parser.getSpectrumBySpecIndex(4)); + Spectrum ms2 = parser.getSpectrumBySpecIndex(2); + Assert.assertNotNull("MS2 (index 2) must come through", ms2); + Assert.assertEquals(2, ms2.getMSLevel()); + Assert.assertEquals(10, ms2.size()); + } + + @Test + public void testCacheSizeOverride() throws Exception { + // Bounded LRU cap is configurable via -Dmsgfplus.mzml.cacheSize. Verify + // a tiny cap still works end-to-end (eviction must not break correctness: + // a re-read of an evicted index just triggers another preload, which is + // an acceptable fallback for huge files). + String prev = System.getProperty("msgfplus.mzml.cacheSize"); + System.setProperty("msgfplus.mzml.cacheSize", "2"); + try { + StaxMzMLParser parser = new StaxMzMLParser(getMzMLFile()); + Spectrum spec = parser.getSpectrumBySpecIndex(2); + Assert.assertNotNull(spec); + Assert.assertEquals(2, spec.getMSLevel()); + } finally { + if (prev == null) System.clearProperty("msgfplus.mzml.cacheSize"); + else System.setProperty("msgfplus.mzml.cacheSize", prev); + } } @Test From cee65898644e3d2baa67f884f40b3a1f22d6001c Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sat, 25 Apr 2026 11:22:42 +0100 Subject: [PATCH 07/10] refactor(msgfplus): defer per-task ScoredSpectraMap construction to worker thread MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ConcurrentMSGFPlus.RunMSGFPlus used to take an already-constructed ScoredSpectraMap + DBScanner from the main MSGFPlus.runMSGFPlus loop. With numTasks defaulting to numThreads * 3 (so up to 24 tasks for an 8-thread run), all per-task spectrum-state instances were materialised upfront on the main thread and held in the ThreadPoolExecutor's queue until the worker that picked them up actually started. Switch to a Supplier handed to the task; the task calls supplier.get() inside run() (with idempotent guard) so each worker builds its own ScoredSpectraMap + DBScanner on the worker thread it will run on. Cleaner ownership of per-task state, and only numThreads task-state instances are alive at once instead of numTasks. The functional difference is small in absolute bytes (the ScoredSpec- traMap constructor only allocates a few empty synchronized HashMaps and a sublist view; preProcessSpectra remains the dominant per-task heap, and that already runs inside run()). The win is structural: per-task construction lives on the same thread that consumes it, which makes the construction-vs-execution timing easier to reason about as future per-task state grows. TestConcurrentMSGFPlus verifies the supplier is not called eagerly: constructing the task with a sentinel-throwing supplier increments no counter; calling task.run() invokes the supplier exactly once. Astral 3-arm passed with these changes in place — armB raw counts are bit-identical to master, no OOM. --- .../msjava/msdbsearch/ConcurrentMSGFPlus.java | 109 ++++++++++-------- .../java/edu/ucsd/msjava/ui/MSGFPlus.java | 60 +++++----- .../msdbsearch/TestConcurrentMSGFPlus.java | 36 ++++++ 3 files changed, 131 insertions(+), 74 deletions(-) create mode 100644 src/test/java/edu/ucsd/msjava/msdbsearch/TestConcurrentMSGFPlus.java diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java b/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java index 8a434214..80ec8135 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java @@ -3,19 +3,22 @@ import edu.ucsd.msjava.misc.ProgressData; import edu.ucsd.msjava.misc.ProgressReporter; -import java.io.PrintStream; -import java.util.List; - -import org.apache.commons.io.output.NullOutputStream; - -public class ConcurrentMSGFPlus { - public static class RunMSGFPlus implements Runnable, ProgressReporter { - private final ScoredSpectraMap specScanner; - private final DBScanner scanner; - SearchParams params; - List resultList; - private final int taskNum; - private ProgressData progress; +import java.io.PrintStream; +import java.util.List; +import java.util.function.Supplier; + +import org.apache.commons.io.output.NullOutputStream; + +public class ConcurrentMSGFPlus { + public static class RunMSGFPlus implements Runnable, ProgressReporter { + private final Supplier specScannerSupplier; + private final CompactSuffixArray sa; + SearchParams params; + List resultList; + private final int taskNum; + private ProgressData progress; + private ScoredSpectraMap specScanner; + private DBScanner scanner; @Override public void setProgressData(ProgressData data) { @@ -27,41 +30,51 @@ public ProgressData getProgressData() { return progress; } - public RunMSGFPlus( - ScoredSpectraMap specScanner, - CompactSuffixArray sa, - SearchParams params, - List resultList, - int taskNum - ) { - this.specScanner = specScanner; - this.params = params; - this.scanner = new DBScanner( - specScanner, - sa, - params.getEnzyme(), - params.getAASet(), - params.getNumMatchesPerSpec(), - params.getMinPeptideLength(), - params.getMaxPeptideLength(), - params.getMaxNumVariantsPerPeptide(), - params.getMinDeNovoScore(), - params.ignoreMetCleavage(), - params.getMaxMissedCleavages() - ); - this.resultList = resultList; - this.taskNum = taskNum; - progress = null; - } - - @Override - public void run() { - if (progress == null) { - progress = new ProgressData(); - } - - PrintStream output; - if (params.getVerbose()) { + public RunMSGFPlus( + Supplier specScannerSupplier, + CompactSuffixArray sa, + SearchParams params, + List resultList, + int taskNum + ) { + this.specScannerSupplier = specScannerSupplier; + this.sa = sa; + this.params = params; + this.resultList = resultList; + this.taskNum = taskNum; + progress = null; + } + + private void ensureSearchStateInitialized() { + if (specScanner != null) { + return; + } + specScanner = specScannerSupplier.get(); + scanner = new DBScanner( + specScanner, + sa, + params.getEnzyme(), + params.getAASet(), + params.getNumMatchesPerSpec(), + params.getMinPeptideLength(), + params.getMaxPeptideLength(), + params.getMaxNumVariantsPerPeptide(), + params.getMinDeNovoScore(), + params.ignoreMetCleavage(), + params.getMaxMissedCleavages() + ); + } + + @Override + public void run() { + if (progress == null) { + progress = new ProgressData(); + } + + ensureSearchStateInitialized(); + + PrintStream output; + if (params.getVerbose()) { output = System.out; } else { output = new PrintStream(new NullOutputStream()); diff --git a/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java b/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java index 0f6f54fd..c3bdc7d0 100644 --- a/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java +++ b/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java @@ -421,32 +421,40 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o } } - try { - for (int i = 0; i < numTasks; i++) { - ScoredSpectraMap specScanner = new ScoredSpectraMap( - specAcc, - Collections.synchronizedList(specKeyList.subList(startIndex[i], endIndex[i])), - leftPrecursorMassTolerance, - rightPrecursorMassTolerance, - minIsotopeError, - maxIsotopeError, - specDataType, - params.outputAdditionalFeatures(), - false, - precursorMassShiftPpm - ); - if (doNotUseEdgeScore) - specScanner.turnOffEdgeScoring(); - - ConcurrentMSGFPlus.RunMSGFPlus msgfplusExecutor = new ConcurrentMSGFPlus.RunMSGFPlus( - specScanner, - sa, - params, - resultList, - i + 1 - ); - - if (DISABLE_THREADING) { + try { + for (int i = 0; i < numTasks; i++) { + final int taskStartIndex = startIndex[i]; + final int taskEndIndex = endIndex[i]; + final boolean storeRankScorer = params.outputAdditionalFeatures(); + final int taskNum = i + 1; + + ConcurrentMSGFPlus.RunMSGFPlus msgfplusExecutor = new ConcurrentMSGFPlus.RunMSGFPlus( + () -> { + // Build task-local spectrum state only when a worker thread + // actually starts running to avoid queueing all task heaps. + ScoredSpectraMap specScanner = new ScoredSpectraMap( + specAcc, + Collections.synchronizedList(specKeyList.subList(taskStartIndex, taskEndIndex)), + leftPrecursorMassTolerance, + rightPrecursorMassTolerance, + minIsotopeError, + maxIsotopeError, + specDataType, + storeRankScorer, + false, + precursorMassShiftPpm + ); + if (doNotUseEdgeScore) + specScanner.turnOffEdgeScoring(); + return specScanner; + }, + sa, + params, + resultList, + taskNum + ); + + if (DISABLE_THREADING) { msgfplusExecutor.run(); } else { executor.execute(msgfplusExecutor); diff --git a/src/test/java/edu/ucsd/msjava/msdbsearch/TestConcurrentMSGFPlus.java b/src/test/java/edu/ucsd/msjava/msdbsearch/TestConcurrentMSGFPlus.java new file mode 100644 index 00000000..9cb5b715 --- /dev/null +++ b/src/test/java/edu/ucsd/msjava/msdbsearch/TestConcurrentMSGFPlus.java @@ -0,0 +1,36 @@ +package edu.ucsd.msjava.msdbsearch; + +import org.junit.Assert; +import org.junit.Test; + +import java.util.Collections; +import java.util.concurrent.atomic.AtomicInteger; + +public class TestConcurrentMSGFPlus { + + @Test + public void defersScoredSpectraMapConstructionUntilRun() { + AtomicInteger buildCount = new AtomicInteger(); + ConcurrentMSGFPlus.RunMSGFPlus task = new ConcurrentMSGFPlus.RunMSGFPlus( + () -> { + buildCount.incrementAndGet(); + throw new IllegalStateException("sentinel"); + }, + null, + null, + Collections.emptyList(), + 1 + ); + + Assert.assertEquals(0, buildCount.get()); + + try { + task.run(); + Assert.fail("Expected the ScoredSpectraMap supplier to run inside run()."); + } catch (IllegalStateException expected) { + Assert.assertEquals("sentinel", expected.getMessage()); + } + + Assert.assertEquals(1, buildCount.get()); + } +} From 0e6539a5a049e67c5fb07a908c78ef5734d3255a Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sat, 25 Apr 2026 11:50:13 +0100 Subject: [PATCH 08/10] refactor(mzml): drop misleading bounded-cache cap; keep MS-filter + defensive copy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous commit advertised a bounded LRU cache capped by -Dmsgfplus.mzml.cacheSize. In practice the constructor raised any configured cap below the in-filter spectrum count back up to that count, because there was no seek-on-demand fallback for evicted spectra and a real eviction would have returned null on a later read. That made the cap a no-op on exactly the files (huge MS2-heavy mzML) the cap was supposed to protect — and the accompanying test only proved that the cap was bypassed, not that bounded behavior worked. This was correctly flagged in review. Drop the LinkedHashMap LRU machinery, the cacheSize sysprop, and the soft-cap warning. The cache is now a plain Collections.synchronizedMap(new HashMap<>()) sized implicitly by the in-filter spectrum count after Layer 1's MS-level preload filter. The delete the misleading testCacheSizeOverride along with it. Memory shape on the dataset that motivated this PR: - Astral 1.9 GB mzML, MS2-only filter: ~25K MS2 in cache, peak RSS 7887 MB on armB / 8025 MB on armC under 8 GB heap. Same as before. The two real wins of the parser changes still stand: - MS-level preload filter (drops MS1 binary decode entirely). - Defensive copy on read (closes calibrator-vs-main-pass mutation drift, removes the implicit invariant the MassCalibrator size guard was working around). Hard memory bounding for very-large-MS2 mzML (metaproteomic / DIA-style) is deferred. It needs accurate byte-offset tracking (current offsets are at StAX-internal-buffer granularity, not at element start) plus a seek-on-demand fallback so eviction can be backed by an O(1) re-parse rather than a null return. Tracking as a follow-up. Tests: - Removed testCacheSizeOverride (asserted the cap was bypassed, contradicting the bounded-cache claim). - Remaining parser tests (testCacheReturnsDefensiveCopy, testMSLevelPreloadFilter, plus the existing suite) still pass: mvn -B -o test -Dtest='TestStaxMzMLParser,...' → 88/88 passing. --- .../edu/ucsd/msjava/mzml/StaxMzMLParser.java | 63 ++++--------------- .../java/msgfplus/TestStaxMzMLParser.java | 18 ------ 2 files changed, 12 insertions(+), 69 deletions(-) diff --git a/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java b/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java index a7e0b89b..155de979 100644 --- a/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java +++ b/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java @@ -65,24 +65,24 @@ public static class SpectrumIndex { // Referenceable param groups: group ID -> list of [accession, name, value, unitAccession, unitName] private final Map> refParamGroups; - /** Sysprop overriding the LRU cache cap. Default {@link #DEFAULT_CACHE_SIZE}. */ - static final String CACHE_SIZE_PROPERTY = "msgfplus.mzml.cacheSize"; - /** Default cache cap (entries). For typical Astral / PXD MS2 counts the cap - * is never hit; for huge DIA / metaproteomic files it bounds heap. */ - static final int DEFAULT_CACHE_SIZE = 50_000; - /** MS-level filter from {@link StaxMzMLSpectraMap}. Spectra outside this * range are never decoded and never cached, so MS1 (typically the bulk of * mzML peaks) doesn't sit in heap during an MS2-only search. */ private final int minMSLevel; private final int maxMSLevel; - /** Bounded access-order LRU cache. Synchronized for concurrent - * preProcess / output-writer access. Returns DEFENSIVE COPIES on read so - * that pre-pass mutations (peak ranking, charge resolution, etc.) cannot - * leak to the main pass. */ + /** Synchronized cache of in-filter spectra. Returns DEFENSIVE COPIES on + * read so that pre-pass mutations (peak ranking, charge resolution, etc.) + * cannot leak to the main pass. Memory is bounded by the in-filter + * spectrum count after the MS-level filter — for an MS2-only search on a + * typical Orbitrap / Astral file that's ~25-75K Spectrum objects, well + * within heap budgets. Hard-capping (LRU eviction) requires a + * seek-on-demand fallback for cache misses; without that, evicting a + * spectrum would corrupt the search. Adding a real ceiling is a + * follow-up: needs accurate byte-offset tracking (current offsets are + * recorded at the StAX-internal-buffer granularity, not at element + * start) plus a re-parse path. */ private final Map cache; - private final int maxCacheSize; private volatile boolean allLoaded = false; // Reusable XMLInputFactory (thread-safe for creation) @@ -127,47 +127,8 @@ public StaxMzMLParser(File specFile, int minMSLevel, int maxMSLevel) throws IOEx this.indexBySpecIdx = new HashMap<>(); this.indexById = new HashMap<>(); this.refParamGroups = new HashMap<>(); - // Build the index first so we can size the cache based on the in-filter - // spectrum count. + this.cache = Collections.synchronizedMap(new HashMap<>()); buildIndex(); - // Effective cache cap: max(configured cap, in-filter spectrum count). - // The in-filter floor matters because there is no seek-on-demand - // fallback yet — if the cap evicted an in-filter spectrum and the - // caller later asked for it, we'd return null. This PR keeps the - // bounded-LRU plumbing but only enforces it when it can't break - // correctness; a future PR can add accurate byte-offset tracking + - // seek-on-demand and let the cap fire as a hard memory ceiling. - int configuredCap = resolveCacheSize(); - int inFilterCount = 0; - for (SpectrumIndex si : indexList) { - if (si.msLevel >= minMSLevel && si.msLevel <= maxMSLevel) inFilterCount++; - } - if (configuredCap < inFilterCount) { - System.out.println("StAX mzML cache cap " + configuredCap - + " is smaller than in-filter spectrum count " + inFilterCount - + "; raising effective cap to " + inFilterCount - + " (seek-on-demand fallback not yet implemented)."); - configuredCap = inFilterCount; - } - this.maxCacheSize = configuredCap; - final int cap = this.maxCacheSize; - this.cache = Collections.synchronizedMap(new LinkedHashMap(16, 0.75f, true) { - @Override - protected boolean removeEldestEntry(Map.Entry eldest) { - return size() > cap; - } - }); - } - - private static int resolveCacheSize() { - String v = System.getProperty(CACHE_SIZE_PROPERTY); - if (v != null) { - try { - int n = Integer.parseInt(v.trim()); - if (n > 0) return n; - } catch (NumberFormatException ignored) { /* fall through */ } - } - return DEFAULT_CACHE_SIZE; } // ----------------------------------------------------------------------- diff --git a/src/test/java/msgfplus/TestStaxMzMLParser.java b/src/test/java/msgfplus/TestStaxMzMLParser.java index c5cf95b3..69f58ed2 100644 --- a/src/test/java/msgfplus/TestStaxMzMLParser.java +++ b/src/test/java/msgfplus/TestStaxMzMLParser.java @@ -205,24 +205,6 @@ public void testMSLevelPreloadFilter() throws Exception { Assert.assertEquals(10, ms2.size()); } - @Test - public void testCacheSizeOverride() throws Exception { - // Bounded LRU cap is configurable via -Dmsgfplus.mzml.cacheSize. Verify - // a tiny cap still works end-to-end (eviction must not break correctness: - // a re-read of an evicted index just triggers another preload, which is - // an acceptable fallback for huge files). - String prev = System.getProperty("msgfplus.mzml.cacheSize"); - System.setProperty("msgfplus.mzml.cacheSize", "2"); - try { - StaxMzMLParser parser = new StaxMzMLParser(getMzMLFile()); - Spectrum spec = parser.getSpectrumBySpecIndex(2); - Assert.assertNotNull(spec); - Assert.assertEquals(2, spec.getMSLevel()); - } finally { - if (prev == null) System.clearProperty("msgfplus.mzml.cacheSize"); - else System.setProperty("msgfplus.mzml.cacheSize", prev); - } - } @Test public void testIteratorWithMSLevelFilter() throws Exception { From 237baf5a0fe0cfde6ff799f67bb325f49027cec1 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sat, 25 Apr 2026 13:40:30 +0100 Subject: [PATCH 09/10] docs(mzml): remove stale LRU/maxCacheSize references in StaxMzMLParser javadoc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 0e6539a removed the bounded LRU + cacheSize sysprop machinery but left two javadoc fragments still describing the deleted behavior: - getSpectrumBySpecIndex Javadoc said "Subsequent calls hit the bounded LRU cache" — the cache is now an unbounded synchronized HashMap. - preloadAllSpectra Javadoc said "If the cache cap ({@link #maxCacheSize}) is smaller than the in-filter spectrum count, the LRU eviction kicks in...". {@link #maxCacheSize} is a dangling reference (field deleted) and would fail strict javadoc generation; the eviction text contradicts the implementation. Rewrite both fragments to describe the actual unbounded-cache contract and point at the deferred follow-up (accurate byte-offset tracking + seek-on-demand) for the future hard memory ceiling. No code change. All scoped tests pass. --- .../java/edu/ucsd/msjava/mzml/StaxMzMLParser.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java b/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java index 155de979..072b50f8 100644 --- a/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java +++ b/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java @@ -179,7 +179,7 @@ public Float getPrecursorMz(int specIndex) { * Parse and return the full spectrum (with peaks) for the given 1-based index. * *

    On first cache miss, performs a bulk preload of all spectra inside the - * MS-level filter. Subsequent calls hit the bounded LRU cache. On a hit, a + * MS-level filter. Subsequent calls hit the in-memory cache. On every hit, a * defensive copy is returned so caller mutations (peak ranking, charge * resolution, etc.) cannot leak to other readers. * @@ -214,11 +214,11 @@ public Spectrum getSpectrumBySpecIndex(int specIndex) { * {@code Peak} objects allocated). On Orbitrap / Astral mzML this drops * the bulk of the heap because MS1 is the peak-count-heavy tier. * - *

    If the cache cap ({@link #maxCacheSize}) is smaller than the in-filter - * spectrum count, the LRU eviction kicks in during the preload — eldest - * entries get dropped, making {@code allLoaded} mean "we attempted to - * preload everything" rather than "every spectrum is in cache." Subsequent - * misses still work (we just re-parse via the seek path). + *

    The cache is unbounded by design: every in-filter spectrum is + * retained for the lifetime of the parser. A hard memory ceiling is + * deferred to a follow-up that adds accurate byte-offset tracking + a + * seek-on-demand fallback so cache misses (after eviction) can re-parse + * a single spectrum cheaply. */ private synchronized void preloadAllSpectra() throws IOException, XMLStreamException { if (allLoaded) return; From 44e268196f7576df9e027f3324ee4e276d6194cf Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Sat, 25 Apr 2026 20:51:07 +0100 Subject: [PATCH 10/10] refactor: trim verbose javadocs and dead code in PR scope (-176 LOC) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Code simplification pass over the eight files this PR touches. Algorithm + behavior unchanged; the same scoped test suite (88/88) still passes. What changed: - Trimmed verbose Javadoc / scaling-notes / PR-context paragraphs that restated the code. Kept single-line "what + non-obvious why" intent per the project's commenting policy. - Removed unused `firstSuffixIndex` field on `RangeMetadata` (never read after the inter-range LCP fixup was reduced to a single byte). - Inlined two single-use helpers: `cleanupOrphanedTempFiles` (one call site) and `ensureSearchStateInitialized` (one call site). Both were thin wrappers around three lines each. - Collapsed redundant `cache.get` lookups in `StaxMzMLParser.getSpectrumBySpecIndex` (one get + clone). - Removed duplicate `CvParamInfo` import + redundant fully-qualified refs in `cloneSpectrum`. - Test files: trimmed comment headers; assertions intact. Whitespace-ignoring diff (true semantic delta): +94 / -270 across 8 files (net -176 LOC). Deliberately not touched: - `compareSuffixesFrom` / `computeLcpByte` semantics (bit-identity gate). - `cloneSpectrum`'s set of copied fields (mirrors what `parseOneSpectrum` sets). - `Bucket` inner class internals (load-bearing for the parallel sort). - `.csarr` / `.cnlcp` header / footer / format-id. - The two-phase MS-level filter (pre-decode index check + post-decode belt-and-braces; the latter catches malformed mzML). - Pre-existing code outside this PR's diff. Tests: TestStaxMzMLParser, TestBuildSAParallelBitIdentity, TestConcurrentMSGFPlus, TestDirectPinWriter, TestMassCalibrator, TestPrecursorCalScaffolding, TestMSLevelFiltering, TestPercolator, TestSA, TestParsers, TestStaxMzMLParserErrorContext, TestPrimitiveRegression, TestMinSpectraPerThread → 88/88 passing. --- .../msdbsearch/CompactFastaSequence.java | 7 +- .../msjava/msdbsearch/CompactSuffixArray.java | 181 ++++-------- .../msjava/msdbsearch/ConcurrentMSGFPlus.java | 259 +++++++++--------- .../msjava/msdbsearch/MassCalibrator.java | 15 +- .../edu/ucsd/msjava/mzml/StaxMzMLParser.java | 105 ++----- .../java/edu/ucsd/msjava/ui/MSGFPlus.java | 68 ++--- .../TestBuildSAParallelBitIdentity.java | 31 +-- .../java/msgfplus/TestStaxMzMLParser.java | 12 +- 8 files changed, 251 insertions(+), 427 deletions(-) diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/CompactFastaSequence.java b/src/main/java/edu/ucsd/msjava/msdbsearch/CompactFastaSequence.java index 68377563..886644b5 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/CompactFastaSequence.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/CompactFastaSequence.java @@ -540,11 +540,8 @@ private FileSignature readSequence() { long lastModified = in.readLong(); sequence = new byte[size]; - // readFully: a plain in.read() is allowed to return fewer bytes than - // requested even on a BufferedInputStream-wrapped FileInputStream, which - // silently corrupts the in-memory sequence on large .cseq files (>2GB - // not yet reachable due to int-indexed `size`, but the short-read failure - // mode is real on any file whose payload exceeds the buffer's chunk). + // readFully: plain in.read() may return short on large .cseq files, + // silently corrupting the in-memory sequence. in.readFully(sequence); in.close(); diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/CompactSuffixArray.java b/src/main/java/edu/ucsd/msjava/msdbsearch/CompactSuffixArray.java index e4c809fc..033b443b 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/CompactSuffixArray.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/CompactSuffixArray.java @@ -275,47 +275,18 @@ private int checkID() { return 0; } - /** Sysprop to override the number of threads used during the sort+LCP phase. */ + /** Sysprop overriding the number of threads used during the sort+LCP phase. */ static final String SA_BUILD_THREADS_PROPERTY = "msgfplus.buildsa.threads"; - /** Default bucket-range thread count cap. Higher values give diminishing returns and - * can thrash IO/caches; 4–8 is a reasonable upper bound. */ + /** Cap on default thread count: higher values give diminishing returns and thrash IO. */ private static final int MAX_DEFAULT_SA_BUILD_THREADS = 8; /** - * Helper method that creates the suffixFile. - * - *

    Algorithm: two-phase radix-then-sort. Each suffix is hashed by its - * first {@link #BUCKET_SIZE} residues into one of {@code alphabetSize^BUCKET_SIZE} - * buckets; within a bucket, suffixes are sorted lexicographically from offset - * {@code BUCKET_SIZE} onward (the bucket prefix is shared by construction). - * - *

    Scaling notes: - * - *

      - *
    • Buckets store raw {@code int[]} indices and are sorted in-place via - * {@link IntArrays#quickSort} with a comparator that reads from the - * sequence directly. Prior revisions materialised a - * {@code SuffixFactory.Suffix[]} per bucket before sorting; that - * allocated ~32 bytes of Java object overhead per suffix, which on a - * 100 MB+ FASTA added multiple GB of transient heap churn and tripped - * OOM on an 8 GB JVM. The int-based path stores 4 bytes per suffix - * and reuses the sequence directly during compare/LCP.
    • - *
    • LCP between adjacent suffixes is computed by an int-based helper - * on the sequence — no {@code ByteSequence} wrappers allocated in the - * sort/write loop.
    • - *
    • The sort + LCP phase is parallelised across contiguous bucket-id - * ranges (one worker thread per range). Each worker produces a - * thread-local buffer of sorted indices + intra-range LCPs; the - * merge step fixes up a single cross-range boundary LCP per - * range-to-range transition and streams the buffers into the final - * files sequentially. Writing stays single-threaded to preserve the - * on-disk suffix-array ordering.
    • - *
    - * - * @param sequence the Adapter object that represents the database (text). - * @param indexFile newly created index file. - * @param nlcpFile newly created nlcp file. + * Build the suffix-array index files. Two-phase radix-then-sort: each suffix + * is hashed by its first {@link #BUCKET_SIZE} residues into a bucket, then + * sorted lexicographically from offset {@code BUCKET_SIZE} onward. The + * sort+LCP phase is parallelised across contiguous bucket-id ranges; the + * write step is single-threaded to preserve on-disk ordering. */ private void createSuffixArrayFiles(CompactFastaSequence sequence, File indexFile, File nlcpFile) { System.out.println("Creating the suffix array indexed file... Size: " + sequence.getSize()); @@ -401,20 +372,13 @@ private void createSuffixArrayFiles(CompactFastaSequence sequence, File indexFil } /** - * Sort + LCP compute phase. Parallelises across contiguous bucket-id ranges - * (one per worker thread); writes a single interleaved stream of indices + - * LCP bytes to disk. Thread count is picked from - * {@link #SA_BUILD_THREADS_PROPERTY} if set, else - * {@code min(availableProcessors, MAX_DEFAULT_SA_BUILD_THREADS)}, else 1. - * - *

    Each worker streams its sorted indices + intra-range LCPs into a - * pair of per-range temp files (alongside the final {@code .csarr}). The - * merge step then walks ranges in bucket-id order, rewrites the first-LCP - * byte of each range against the previous range's last-bucket first suffix - * (a single fixup per range boundary), and streams the temp files into the - * output files. Temp files are deleted in a {@code finally} block on both - * success and failure paths; {@link File#deleteOnExit} is also requested - * as a belt-and-braces fallback for hard crashes. + * Sort + LCP compute phase. Parallelises across contiguous bucket-id + * ranges; each worker streams its sorted indices + intra-range LCPs into + * per-range temp files. The merge step fixes up the cross-range boundary + * LCP byte and streams the temp files into the final output sequentially + * (writing single-threaded preserves on-disk ordering). Temp files are + * deleted in the {@code finally} block, with {@link File#deleteOnExit} as + * a fallback for hard crashes. */ private static void sortAndWriteBuckets(CompactFastaSequence sequence, Bucket[] bucketSuffixes, @@ -424,9 +388,6 @@ private static void sortAndWriteBuckets(CompactFastaSequence sequence, int numThreads = resolveSortThreads(); int[][] ranges = partitionBucketIds(bucketSuffixes, numThreads); - // Single-thread fast path: stream directly to the output without - // materialising any per-range buffer. Matches the pre-parallel-refactor - // memory + CPU profile byte-for-byte on the sysprop-disabled path. if (ranges.length == 1) { writeBucketsDirect(sequence, bucketSuffixes, ranges[0][0], ranges[0][1], indexOut, nlcpOut); return; @@ -470,8 +431,6 @@ private static void sortAndWriteBuckets(CompactFastaSequence sequence, pool.shutdown(); } - // Merge: stream each range's temp files into the output, fixing up - // the cross-range boundary LCP byte at each transition. int prevRangeLastBucketFirst = -1; for (RangeMetadata md : rangeMetadatas) { if (md.numEntries == 0) continue; @@ -479,14 +438,15 @@ private static void sortAndWriteBuckets(CompactFastaSequence sequence, prevRangeLastBucketFirst = md.lastBucketFirstSuffix; } } finally { - // Best-effort cleanup of all temp files on both success and failure. for (RangeMetadata md : rangeMetadatas) { deleteQuietly(md.tempIndicesFile); deleteQuietly(md.tempLcpsFile); } - // Also pick up any temp files from workers that didn't finish (so - // their RangeMetadata never made it into the list). - cleanupOrphanedTempFiles(parentDir, tempBasename); + // Sweep debris from workers that died before returning a RangeMetadata. + File[] orphans = parentDir.listFiles((dir, name) -> name.startsWith(tempBasename)); + if (orphans != null) { + for (File f : orphans) deleteQuietly(f); + } } } @@ -496,23 +456,11 @@ private static void deleteQuietly(File f) { } /** - * Best-effort sweep of temp files matching {@code .*} in - * {@code parentDir}. Used in the {@code finally} block to catch debris - * from workers that died before returning a {@link RangeMetadata}. - */ - private static void cleanupOrphanedTempFiles(File parentDir, String tempBasename) { - if (parentDir == null || tempBasename == null) return; - File[] orphans = parentDir.listFiles((dir, name) -> name.startsWith(tempBasename)); - if (orphans == null) return; - for (File f : orphans) deleteQuietly(f); - } - - /** - * Stream one range's per-worker temp files into the final output streams. - * The first LCP byte is rewritten against {@code prevRangeLastBucketFirst} - * to bridge the cross-range boundary; the rest of the temp data is copied - * verbatim. Caller is responsible for deleting the temp files afterwards - * (handled in the {@code finally} block of {@link #sortAndWriteBuckets}). + * Stream one range's temp files into the final output. The first LCP byte + * is rewritten against {@code prevRangeLastBucketFirst} to bridge the + * cross-range boundary; for the globally-first range + * {@code prevRangeLastBucketFirst} is -1 and the placeholder 0 written by + * the worker passes through. */ private static void mergeRangeIntoOutput(CompactFastaSequence sequence, RangeMetadata md, @@ -521,9 +469,6 @@ private static void mergeRangeIntoOutput(CompactFastaSequence sequence, DataOutputStream nlcpOut) throws IOException { try (DataInputStream idxIn = new DataInputStream(new BufferedInputStream(new FileInputStream(md.tempIndicesFile))); DataInputStream lcpIn = new DataInputStream(new BufferedInputStream(new FileInputStream(md.tempLcpsFile)))) { - // First entry: rewrite LCP against the previous range's last-bucket - // first suffix. For the globally first range, prevRangeLastBucketFirst - // is -1 and we pass through the placeholder 0 written by the worker. int firstIndex = idxIn.readInt(); byte firstLcp = lcpIn.readByte(); if (prevRangeLastBucketFirst >= 0) { @@ -545,19 +490,15 @@ private static int resolveSortThreads() { try { int n = Integer.parseInt(configured.trim()); if (n > 0) return n; - } catch (NumberFormatException ignored) { - // fall through to default - } + } catch (NumberFormatException ignored) { } } int procs = Runtime.getRuntime().availableProcessors(); return Math.max(1, Math.min(procs, MAX_DEFAULT_SA_BUILD_THREADS)); } /** - * Split the bucket-id range {@code [0, buckets.length)} into contiguous - * ranges, one per thread. Ranges are balanced by total suffix count (sum - * of {@code bucket.size}) so each worker has roughly the same amount of - * sort + LCP work rather than the same number of hash buckets. + * Split bucket ids into contiguous ranges balanced by total suffix count + * (so each worker has roughly equal sort+LCP work, not equal bucket count). */ private static int[][] partitionBucketIds(Bucket[] buckets, int numThreads) { if (numThreads <= 1 || buckets.length == 0) { @@ -586,7 +527,6 @@ private static int[][] partitionBucketIds(Bucket[] buckets, int numThreads) { } } ranges[rangeIdx++] = new int[]{rangeStart, buckets.length}; - // Trim to actual range count. if (rangeIdx != numThreads) { int[][] trimmed = new int[rangeIdx][]; System.arraycopy(ranges, 0, trimmed, 0, rangeIdx); @@ -596,15 +536,11 @@ private static int[][] partitionBucketIds(Bucket[] buckets, int numThreads) { } /** - * Process a contiguous bucket-id range: sort each bucket, compute intra-range - * LCPs, and stream the output into per-worker temp files. The first LCP byte - * is a placeholder (0) to be fixed up during merge against the previous - * range's last bucket. - * - *

    Each bucket reference is released as soon as its int[] has been sorted, - * keeping peak heap bounded by the largest in-flight bucket per thread. - * No per-range int[] / byte[] is materialised — disk-backed instead, so peak - * heap during the parallel sort phase does not scale with FASTA size. + * Sort each bucket in the range, compute intra-range LCPs, and stream the + * output into per-worker temp files. The first LCP byte is a placeholder + * (0) — the merge step rewrites it against the previous range's last + * bucket. Each bucket's storage is released as soon as it is sorted, so + * peak heap is bounded by the largest in-flight bucket per thread. */ private static RangeMetadata processBucketRangeToTempFiles(CompactFastaSequence sequence, Bucket[] buckets, @@ -617,18 +553,17 @@ private static RangeMetadata processBucketRangeToTempFiles(CompactFastaSequence if (buckets[i] != null) count += buckets[i].size; } if (count == 0L) { - // No work to do: return a sentinel; no temp files were created. - return new RangeMetadata(null, null, 0, -1, -1); + return new RangeMetadata(null, null, 0, -1); } if (count > Integer.MAX_VALUE) { throw new IllegalStateException("Suffix array bucket range exceeds Integer.MAX_VALUE entries"); } - int firstSuffixIndex = -1; int lastBucketFirstSuffix = -1; int prevIntraBucketLast = -1; int prevBucketFirst = -1; int numEntries = 0; + boolean firstBucketSeen = false; try (DataOutputStream idxOut = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(tempIndicesFile))); DataOutputStream lcpOut = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(tempLcpsFile)))) { @@ -637,23 +572,15 @@ private static RangeMetadata processBucketRangeToTempFiles(CompactFastaSequence if (bucket == null) continue; int[] sorted = bucket.trimmedArray(); - buckets[bucketId] = null; // release + buckets[bucketId] = null; IntArrays.quickSort(sorted, (a, b) -> compareSuffixesFrom(sequence, a, b, BUCKET_SIZE)); int first = sorted[0]; idxOut.writeInt(first); - byte lcp; - if (firstSuffixIndex < 0) { - firstSuffixIndex = first; - // Placeholder — merge step rewrites this based on the previous - // range's last-bucket first suffix. For the globally-first - // bucket this stays 0. - lcp = 0; - } else { - lcp = computeLcpByte(sequence, first, prevBucketFirst, 0); - } + byte lcp = firstBucketSeen ? computeLcpByte(sequence, first, prevBucketFirst, 0) : 0; lcpOut.writeByte(lcp); numEntries++; + firstBucketSeen = true; prevIntraBucketLast = first; for (int j = 1; j < sorted.length; j++) { @@ -669,14 +596,13 @@ private static RangeMetadata processBucketRangeToTempFiles(CompactFastaSequence } } - return new RangeMetadata(tempIndicesFile, tempLcpsFile, numEntries, firstSuffixIndex, lastBucketFirstSuffix); + return new RangeMetadata(tempIndicesFile, tempLcpsFile, numEntries, lastBucketFirstSuffix); } /** - * Single-thread direct-write path. Sorts each bucket, computes LCPs, and - * writes to disk in one pass — no thread-local buffer, no merge, no - * executor. Used when {@link #SA_BUILD_THREADS_PROPERTY} resolves to 1 - * (typically for deterministic testing or low-core machines). + * Single-thread direct-write path: sort each bucket, compute LCPs, and + * write to disk in one pass. Used when {@link #SA_BUILD_THREADS_PROPERTY} + * resolves to 1. */ private static void writeBucketsDirect(CompactFastaSequence sequence, Bucket[] buckets, @@ -719,22 +645,19 @@ private static void writeBucketsDirect(CompactFastaSequence sequence, } } - /** Per-worker sort + LCP output handle. The actual indices/LCPs live on disk - * in {@link #tempIndicesFile} / {@link #tempLcpsFile}; this struct just - * carries the small metadata the merge step needs. Workers with no entries - * in their range create no temp files and return {@code null} paths. */ + /** Per-worker sort+LCP output handle. Indices/LCPs live on disk; this carries + * the small metadata the merge step needs. Empty ranges return {@code null} + * file paths. */ static final class RangeMetadata { final File tempIndicesFile; final File tempLcpsFile; final int numEntries; - final int firstSuffixIndex; final int lastBucketFirstSuffix; - RangeMetadata(File tempIndicesFile, File tempLcpsFile, int numEntries, int firstSuffixIndex, int lastBucketFirstSuffix) { + RangeMetadata(File tempIndicesFile, File tempLcpsFile, int numEntries, int lastBucketFirstSuffix) { this.tempIndicesFile = tempIndicesFile; this.tempLcpsFile = tempLcpsFile; this.numEntries = numEntries; - this.firstSuffixIndex = firstSuffixIndex; this.lastBucketFirstSuffix = lastBucketFirstSuffix; } } @@ -767,11 +690,9 @@ int[] trimmedArray() { } /** - * Compare two suffixes of {@code sequence} starting at the given offset. Sign - * semantics match {@link Comparable#compareTo}; magnitude is not preserved - * (we only need the sort order, not the LCP — that's computed separately). - * Comparison length is capped at {@link ByteSequence#MAX_COMPARISON_LENGTH} - * to match the legacy {@code Suffix.compareTo} behaviour. + * Compare two suffixes of {@code sequence} starting at the given offset. + * Sign semantics match {@link Comparable#compareTo} and {@link ByteSequence#compareTo}; + * magnitude is not preserved. */ private static int compareSuffixesFrom(CompactFastaSequence sequence, int idxA, int idxB, int startOffset) { if (idxA == idxB) return 0; @@ -791,11 +712,7 @@ private static int compareSuffixesFrom(CompactFastaSequence sequence, int idxA, return Long.compare(remainA, remainB); } - /** - * Compute the LCP (longest common prefix length) of two suffixes starting - * from {@code startOffset}. Capped at {@link Byte#MAX_VALUE} so the result - * fits in a single byte for on-disk storage. - */ + /** LCP of two suffixes starting from {@code startOffset}, capped at {@link Byte#MAX_VALUE}. */ private static byte computeLcpByte(CompactFastaSequence sequence, int idxA, int idxB, int startOffset) { long seqSize = sequence.getSize(); long remainA = seqSize - idxA; diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java b/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java index 80ec8135..4afcd8a5 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/ConcurrentMSGFPlus.java @@ -1,8 +1,8 @@ -package edu.ucsd.msjava.msdbsearch; - -import edu.ucsd.msjava.misc.ProgressData; -import edu.ucsd.msjava.misc.ProgressReporter; - +package edu.ucsd.msjava.msdbsearch; + +import edu.ucsd.msjava.misc.ProgressData; +import edu.ucsd.msjava.misc.ProgressReporter; + import java.io.PrintStream; import java.util.List; import java.util.function.Supplier; @@ -19,17 +19,17 @@ public static class RunMSGFPlus implements Runnable, ProgressReporter { private ProgressData progress; private ScoredSpectraMap specScanner; private DBScanner scanner; - - @Override - public void setProgressData(ProgressData data) { - progress = data; - } - - @Override - public ProgressData getProgressData() { - return progress; - } - + + @Override + public void setProgressData(ProgressData data) { + progress = data; + } + + @Override + public ProgressData getProgressData() { + return progress; + } + public RunMSGFPlus( Supplier specScannerSupplier, CompactSuffixArray sa, @@ -45,128 +45,123 @@ public RunMSGFPlus( progress = null; } - private void ensureSearchStateInitialized() { - if (specScanner != null) { - return; - } - specScanner = specScannerSupplier.get(); - scanner = new DBScanner( - specScanner, - sa, - params.getEnzyme(), - params.getAASet(), - params.getNumMatchesPerSpec(), - params.getMinPeptideLength(), - params.getMaxPeptideLength(), - params.getMaxNumVariantsPerPeptide(), - params.getMinDeNovoScore(), - params.ignoreMetCleavage(), - params.getMaxMissedCleavages() - ); - } - @Override public void run() { if (progress == null) { progress = new ProgressData(); } - ensureSearchStateInitialized(); + if (specScanner == null) { + specScanner = specScannerSupplier.get(); + scanner = new DBScanner( + specScanner, + sa, + params.getEnzyme(), + params.getAASet(), + params.getNumMatchesPerSpec(), + params.getMinPeptideLength(), + params.getMaxPeptideLength(), + params.getMaxNumVariantsPerPeptide(), + params.getMinDeNovoScore(), + params.ignoreMetCleavage(), + params.getMaxMissedCleavages() + ); + } PrintStream output; if (params.getVerbose()) { - output = System.out; - } else { - output = new PrintStream(new NullOutputStream()); - } - - progress.stepRange(5.0); - String threadName = Thread.currentThread().getName(); - output.println(threadName + ": Starting task " + taskNum); - - specScanner.setProgressObj(new ProgressData(progress)); - - // Pre-process spectra - long startTimePreprocess = System.currentTimeMillis(); - if (Thread.currentThread().isInterrupted()) { - return; - } - - if (specScanner.getPepMassSpecKeyMap().size() == 0) - specScanner.makePepMassSpecKeyMap(); - - output.println(threadName + ": Preprocessing spectra..."); - if (Thread.currentThread().isInterrupted()) { - return; - } - specScanner.preProcessSpectra(); - if (Thread.currentThread().isInterrupted()) { - return; - } - output.print(threadName + ": Preprocessing spectra finished "); - output.format("(elapsed time: %.2f sec)\n", (float) ((System.currentTimeMillis() - startTimePreprocess) / 1000)); - - specScanner.getProgressObj().setParentProgressObj(null); - progress.report(5.0); - progress.stepRange(80.0); - scanner.setProgressObj(new ProgressData(progress)); - - long startTimeDbSearch = System.currentTimeMillis(); - - // DB search - output.println(threadName + ": Database search..."); - scanner.setThreadName(threadName); - scanner.setPrintStream(output); - - int ntt = params.getNumTolerableTermini(); - if (params.getEnzyme() == null) - ntt = 0; - int nnet = 2 - ntt; - if (Thread.currentThread().isInterrupted()) { - return; - } - scanner.dbSearch(nnet); - if (Thread.currentThread().isInterrupted()) { - return; - } - output.print(threadName + ": Database search finished "); - output.format("(elapsed time: %.2f sec)\n", (float) ((System.currentTimeMillis() - startTimeDbSearch) / 1000)); - - progress.stepRange(95.0); - - long startTimeComputeEvalue = System.currentTimeMillis(); - output.println(threadName + ": Computing spectral E-values..."); - if (Thread.currentThread().isInterrupted()) { - return; - } - scanner.computeSpecEValue(false); - if (Thread.currentThread().isInterrupted()) { - return; - } - output.print(threadName + ": Computing spectral E-values finished "); - output.format("(elapsed time: %.2f sec)\n", (float) ((System.currentTimeMillis() - startTimeComputeEvalue) / 1000)); - - scanner.getProgressObj().setParentProgressObj(null); - progress.stepRange(100); - - if (Thread.currentThread().isInterrupted()) { - return; - } - - scanner.generateSpecIndexDBMatchMap(); - - progress.report(30.0); - - if (params.outputAdditionalFeatures()) - scanner.addAdditionalFeatures(); - - progress.report(60.0); - - scanner.addResultsToList(resultList); - - progress.report(100.0); -// gen.addSpectrumIdentificationResults(scanner.getSpecIndexDBMatchMap()); - output.println(threadName + ": Task " + taskNum + " completed."); - } - } -} + output = System.out; + } else { + output = new PrintStream(new NullOutputStream()); + } + + progress.stepRange(5.0); + String threadName = Thread.currentThread().getName(); + output.println(threadName + ": Starting task " + taskNum); + + specScanner.setProgressObj(new ProgressData(progress)); + + // Pre-process spectra + long startTimePreprocess = System.currentTimeMillis(); + if (Thread.currentThread().isInterrupted()) { + return; + } + + if (specScanner.getPepMassSpecKeyMap().size() == 0) + specScanner.makePepMassSpecKeyMap(); + + output.println(threadName + ": Preprocessing spectra..."); + if (Thread.currentThread().isInterrupted()) { + return; + } + specScanner.preProcessSpectra(); + if (Thread.currentThread().isInterrupted()) { + return; + } + output.print(threadName + ": Preprocessing spectra finished "); + output.format("(elapsed time: %.2f sec)\n", (float) ((System.currentTimeMillis() - startTimePreprocess) / 1000)); + + specScanner.getProgressObj().setParentProgressObj(null); + progress.report(5.0); + progress.stepRange(80.0); + scanner.setProgressObj(new ProgressData(progress)); + + long startTimeDbSearch = System.currentTimeMillis(); + + // DB search + output.println(threadName + ": Database search..."); + scanner.setThreadName(threadName); + scanner.setPrintStream(output); + + int ntt = params.getNumTolerableTermini(); + if (params.getEnzyme() == null) + ntt = 0; + int nnet = 2 - ntt; + if (Thread.currentThread().isInterrupted()) { + return; + } + scanner.dbSearch(nnet); + if (Thread.currentThread().isInterrupted()) { + return; + } + output.print(threadName + ": Database search finished "); + output.format("(elapsed time: %.2f sec)\n", (float) ((System.currentTimeMillis() - startTimeDbSearch) / 1000)); + + progress.stepRange(95.0); + + long startTimeComputeEvalue = System.currentTimeMillis(); + output.println(threadName + ": Computing spectral E-values..."); + if (Thread.currentThread().isInterrupted()) { + return; + } + scanner.computeSpecEValue(false); + if (Thread.currentThread().isInterrupted()) { + return; + } + output.print(threadName + ": Computing spectral E-values finished "); + output.format("(elapsed time: %.2f sec)\n", (float) ((System.currentTimeMillis() - startTimeComputeEvalue) / 1000)); + + scanner.getProgressObj().setParentProgressObj(null); + progress.stepRange(100); + + if (Thread.currentThread().isInterrupted()) { + return; + } + + scanner.generateSpecIndexDBMatchMap(); + + progress.report(30.0); + + if (params.outputAdditionalFeatures()) + scanner.addAdditionalFeatures(); + + progress.report(60.0); + + scanner.addResultsToList(resultList); + + progress.report(100.0); +// gen.addSpectrumIdentificationResults(scanner.getSpecIndexDBMatchMap()); + output.println(threadName + ": Task " + taskNum + " completed."); + } + } +} diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/MassCalibrator.java b/src/main/java/edu/ucsd/msjava/msdbsearch/MassCalibrator.java index 652f034e..24817f94 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/MassCalibrator.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/MassCalibrator.java @@ -119,20 +119,7 @@ public MassCalibrator( * @return learned ppm shift, or 0.0 if the pre-pass had insufficient data */ public double learnPrecursorShiftPpm(int ioIndex) { - // Size guard: skip the pre-pass on small files where it can't yield - // MIN_CONFIDENT_PSMS reliably. - // - // Historical context (kept here so the threshold isn't trimmed by accident): - // the 10_000 SpecKey floor was originally also a workaround for state- - // mutation drift — the StaxMzMLParser cache returned the SAME Spectrum - // instance to the pre-pass and main pass, and preProcessSpectra mutated - // peak ranks / charge in-place, causing ~0.1% PSM-list drift vs - // -precursorCal off. As of the big-FASTA / Astral-OOM PR - // (feature/improve-mzid-suffix-big-fasta), the parser returns a defensive - // copy on every getSpectrumBySpecIndex, so the mutation pathway is closed. - // The threshold remains as a "is there enough data to learn from" gate; - // it is no longer a correctness shield. If the calibrator ever needs to - // run on smaller files, this floor can be lowered safely. + // Skip the pre-pass on small files where MIN_CONFIDENT_PSMS can't be reached. if (specKeyList == null || specKeyList.size() < MIN_SPECKEYS_FOR_PREPASS) { return 0.0; } diff --git a/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java b/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java index 072b50f8..12b15475 100644 --- a/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java +++ b/src/main/java/edu/ucsd/msjava/mzml/StaxMzMLParser.java @@ -9,8 +9,6 @@ import ch.qos.logback.classic.Logger; import ch.qos.logback.classic.LoggerContext; -import edu.ucsd.msjava.msutil.CvParamInfo; - import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.XMLStreamException; @@ -65,23 +63,12 @@ public static class SpectrumIndex { // Referenceable param groups: group ID -> list of [accession, name, value, unitAccession, unitName] private final Map> refParamGroups; - /** MS-level filter from {@link StaxMzMLSpectraMap}. Spectra outside this - * range are never decoded and never cached, so MS1 (typically the bulk of - * mzML peaks) doesn't sit in heap during an MS2-only search. */ + /** MS-level filter: spectra outside this range are never decoded or cached. */ private final int minMSLevel; private final int maxMSLevel; - /** Synchronized cache of in-filter spectra. Returns DEFENSIVE COPIES on - * read so that pre-pass mutations (peak ranking, charge resolution, etc.) - * cannot leak to the main pass. Memory is bounded by the in-filter - * spectrum count after the MS-level filter — for an MS2-only search on a - * typical Orbitrap / Astral file that's ~25-75K Spectrum objects, well - * within heap budgets. Hard-capping (LRU eviction) requires a - * seek-on-demand fallback for cache misses; without that, evicting a - * spectrum would corrupt the search. Adding a real ceiling is a - * follow-up: needs accurate byte-offset tracking (current offsets are - * recorded at the StAX-internal-buffer granularity, not at element - * start) plus a re-parse path. */ + /** Synchronized cache of in-filter spectra. Returns defensive copies on read + * so pre-pass mutations cannot leak to the main pass. */ private final Map cache; private volatile boolean allLoaded = false; @@ -96,13 +83,9 @@ public static class SpectrumIndex { } /** - * Construct a parser for the given mzML file with no MS-level filter - * (caches every spectrum). Immediately builds the spectrum index. - * - *

    Prefer the {@link #StaxMzMLParser(File, int, int)} overload — passing - * the MS-level filter through to the parser is what lets MS1 spectra be - * skipped entirely during the binary-decode preload, which is the dominant - * heap saving on Orbitrap / Astral mzML files. + * Construct a parser for the given mzML file with no MS-level filter. + * Prefer {@link #StaxMzMLParser(File, int, int)} so MS1 spectra can be + * skipped during the binary-decode preload. */ public StaxMzMLParser(File specFile) throws IOException, XMLStreamException { this(specFile, 1, Integer.MAX_VALUE); @@ -112,12 +95,6 @@ public StaxMzMLParser(File specFile) throws IOException, XMLStreamException { * Construct a parser for the given mzML file, decoding/caching only spectra * with MS level inside {@code [minMSLevel, maxMSLevel]}. Immediately builds * the spectrum index (single sequential pass; no peak decode). - * - *

    Caveat: if a future feature needs the parent MS1 of an MS2 scan - * (precursor refinement à la MaxQuant, chimeric-spectrum analysis, etc.), - * widen this filter or add a separate MS1-only accessor — do not silently - * widen the filter here, which would re-introduce the OOM regression this - * preload-filter is fixing. */ public StaxMzMLParser(File specFile, int minMSLevel, int maxMSLevel) throws IOException, XMLStreamException { this.specFile = specFile; @@ -177,48 +154,29 @@ public Float getPrecursorMz(int specIndex) { /** * Parse and return the full spectrum (with peaks) for the given 1-based index. - * - *

    On first cache miss, performs a bulk preload of all spectra inside the - * MS-level filter. Subsequent calls hit the in-memory cache. On every hit, a - * defensive copy is returned so caller mutations (peak ranking, charge - * resolution, etc.) cannot leak to other readers. - * - *

    Returns {@code null} for unknown indices and for spectra outside the - * configured MS-level filter (matching the existing {@link StaxMzMLSpectraMap} - * contract). + * On first cache miss, performs a bulk preload of all in-filter spectra; every + * subsequent call returns a defensive copy from the cache. Returns {@code null} + * for unknown indices and for spectra outside the configured MS-level filter. */ public Spectrum getSpectrumBySpecIndex(int specIndex) { SpectrumIndex si = indexBySpecIdx.get(specIndex); if (si == null) return null; if (si.msLevel < minMSLevel || si.msLevel > maxMSLevel) return null; - if (!allLoaded) { - Spectrum cached = cache.get(specIndex); - if (cached == null) { - try { - preloadAllSpectra(); - } catch (Exception e) { - throw new RuntimeException("Failed to preload spectra while retrieving spectrum index " + specIndex, e); - } + if (!allLoaded && !cache.containsKey(specIndex)) { + try { + preloadAllSpectra(); + } catch (Exception e) { + throw new RuntimeException("Failed to preload spectra while retrieving spectrum index " + specIndex, e); } } - Spectrum cached = cache.get(specIndex); - return cloneSpectrum(cached); + return cloneSpectrum(cache.get(specIndex)); } /** - * Bulk preload pass: walks the file once and inserts every in-filter - * spectrum into the cache. Spectra with MS level outside - * {@code [minMSLevel, maxMSLevel]} are skipped without binary decode (the - * {@code } element is advanced past, no {@link Spectrum} or - * {@code Peak} objects allocated). On Orbitrap / Astral mzML this drops - * the bulk of the heap because MS1 is the peak-count-heavy tier. - * - *

    The cache is unbounded by design: every in-filter spectrum is - * retained for the lifetime of the parser. A hard memory ceiling is - * deferred to a follow-up that adds accurate byte-offset tracking + a - * seek-on-demand fallback so cache misses (after eviction) can re-parse - * a single spectrum cheaply. + * Walk the file once and cache every in-filter spectrum. Out-of-filter + * spectra are skipped without binary decode — no {@link Spectrum} or + * {@code Peak} objects allocated. */ private synchronized void preloadAllSpectra() throws IOException, XMLStreamException { if (allLoaded) return; @@ -230,9 +188,8 @@ private synchronized void preloadAllSpectra() throws IOException, XMLStreamExcep while (reader.hasNext()) { int event = reader.next(); if (event == XMLStreamConstants.START_ELEMENT && "spectrum".equals(reader.getLocalName())) { - // Look up MS level from the pre-built index to decide whether to - // pay the binary-decode cost. We can't trust the element's - // index attribute alone here; use the id attribute that the index built. + // Skip out-of-filter spectra without binary decode by consulting + // the pre-built index via the spectrum's id attribute. String id = reader.getAttributeValue(null, "id"); SpectrumIndex si = id != null ? indexById.get(id) : null; if (si != null && (si.msLevel < minMSLevel || si.msLevel > maxMSLevel)) { @@ -244,8 +201,7 @@ private synchronized void preloadAllSpectra() throws IOException, XMLStreamExcep if (spec != null) { int ms = spec.getMSLevel(); if (ms < minMSLevel || ms > maxMSLevel) { - // Belt-and-braces: index lookup didn't catch it (rare — id - // mismatch on malformed mzML), drop it post-parse. + // Index lookup missed (malformed mzML id mismatch); drop post-parse. skipped++; continue; } @@ -266,15 +222,8 @@ private synchronized void preloadAllSpectra() throws IOException, XMLStreamExcep } /** - * Defensive copy of a cached {@link Spectrum}. The cache is shared across - * pre-pass (e.g. {@link edu.ucsd.msjava.msdbsearch.MassCalibrator}) and main - * pass; both can mutate the returned Spectrum (peak ranking via - * {@link edu.ucsd.msjava.msutil.Peak#setRank}, charge resolution, peak - * filtering). Returning a fresh {@link Spectrum} with cloned {@link Peak}s - * preserves the master parser's "every read is a fresh object" semantics - * and removes the silent state-mutation drift that - * {@link edu.ucsd.msjava.msdbsearch.MassCalibrator}'s 10K-SpecKey gate is - * currently working around. + * Defensive copy of a cached {@link Spectrum}. Mirrors the field set + * populated by {@link #parseOneSpectrum}; keep the two in lock-step. */ private static Spectrum cloneSpectrum(Spectrum src) { if (src == null) return null; @@ -289,15 +238,15 @@ private static Spectrum cloneSpectrum(Spectrum src) { dst.setRtIsSeconds(src.getRtIsSeconds()); dst.setIsolationWindowTargetMz(src.getIsolationWindowTargetMz()); if (src.getPrecursorPeak() != null) { - edu.ucsd.msjava.msutil.Peak p = src.getPrecursorPeak(); - dst.setPrecursor(new edu.ucsd.msjava.msutil.Peak(p.getMz(), p.getIntensity(), p.getCharge())); + Peak p = src.getPrecursorPeak(); + dst.setPrecursor(new Peak(p.getMz(), p.getIntensity(), p.getCharge())); } if (src.getActivationMethod() != null) dst.setActivationMethod(src.getActivationMethod()); if (src.getAddlCvParams() != null) { for (CvParamInfo cv : src.getAddlCvParams()) dst.addAddlCvParam(cv); } - for (edu.ucsd.msjava.msutil.Peak p : src) { - dst.add(new edu.ucsd.msjava.msutil.Peak(p.getMz(), p.getIntensity(), p.getCharge())); + for (Peak p : src) { + dst.add(new Peak(p.getMz(), p.getIntensity(), p.getCharge())); } return dst; } diff --git a/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java b/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java index c3bdc7d0..3a10c9bc 100644 --- a/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java +++ b/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java @@ -421,40 +421,40 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o } } - try { - for (int i = 0; i < numTasks; i++) { - final int taskStartIndex = startIndex[i]; - final int taskEndIndex = endIndex[i]; - final boolean storeRankScorer = params.outputAdditionalFeatures(); - final int taskNum = i + 1; - - ConcurrentMSGFPlus.RunMSGFPlus msgfplusExecutor = new ConcurrentMSGFPlus.RunMSGFPlus( - () -> { - // Build task-local spectrum state only when a worker thread - // actually starts running to avoid queueing all task heaps. - ScoredSpectraMap specScanner = new ScoredSpectraMap( - specAcc, - Collections.synchronizedList(specKeyList.subList(taskStartIndex, taskEndIndex)), - leftPrecursorMassTolerance, - rightPrecursorMassTolerance, - minIsotopeError, - maxIsotopeError, - specDataType, - storeRankScorer, - false, - precursorMassShiftPpm - ); - if (doNotUseEdgeScore) - specScanner.turnOffEdgeScoring(); - return specScanner; - }, - sa, - params, - resultList, - taskNum - ); - - if (DISABLE_THREADING) { + try { + for (int i = 0; i < numTasks; i++) { + final int taskStartIndex = startIndex[i]; + final int taskEndIndex = endIndex[i]; + final boolean storeRankScorer = params.outputAdditionalFeatures(); + final int taskNum = i + 1; + + // Defer ScoredSpectraMap construction to the worker thread so all + // tasks' spectrum heaps aren't allocated up front when queued. + ConcurrentMSGFPlus.RunMSGFPlus msgfplusExecutor = new ConcurrentMSGFPlus.RunMSGFPlus( + () -> { + ScoredSpectraMap specScanner = new ScoredSpectraMap( + specAcc, + Collections.synchronizedList(specKeyList.subList(taskStartIndex, taskEndIndex)), + leftPrecursorMassTolerance, + rightPrecursorMassTolerance, + minIsotopeError, + maxIsotopeError, + specDataType, + storeRankScorer, + false, + precursorMassShiftPpm + ); + if (doNotUseEdgeScore) + specScanner.turnOffEdgeScoring(); + return specScanner; + }, + sa, + params, + resultList, + taskNum + ); + + if (DISABLE_THREADING) { msgfplusExecutor.run(); } else { executor.execute(msgfplusExecutor); diff --git a/src/test/java/msgfplus/TestBuildSAParallelBitIdentity.java b/src/test/java/msgfplus/TestBuildSAParallelBitIdentity.java index c3498a8f..9e06abbf 100644 --- a/src/test/java/msgfplus/TestBuildSAParallelBitIdentity.java +++ b/src/test/java/msgfplus/TestBuildSAParallelBitIdentity.java @@ -12,14 +12,9 @@ import java.nio.file.StandardCopyOption; /** - * BuildSA bit-identity test: the parallel temp-file path must produce - * byte-identical .csarr/.cnlcp output to the single-thread direct-write path - * past the 8-byte header (size + id; the id changes per-build via - * non-deterministic UUID-hash in CompactFastaSequence). - * - *

    Runs the same FASTA through both paths, captures the resulting bytes, - * and verifies they match. Also verifies that the parallel path leaves no - * temp files behind on success. + * Bit-identity test: the parallel sort path must produce byte-identical + * .csarr/.cnlcp output to the single-thread path between the header and footer + * (header id and footer mtime are non-deterministic between builds). */ public class TestBuildSAParallelBitIdentity { @@ -60,7 +55,6 @@ public void parallelMatchesSingleThreadByteForByte() throws Exception { Assert.assertArrayEquals(".csarr post-header bytes must be identical between N=1 and N=4", singleCsarr, parallelCsarr); Assert.assertArrayEquals(".cnlcp post-header bytes must be identical between N=1 and N=4", singleCnlcp, parallelCnlcp); - // No temp debris left behind in the parallel build's directory. File parentDir = parallelArtifacts.getAbsoluteFile().getParentFile(); File[] debris = parentDir.listFiles((dir, name) -> name.contains(".buildsa-tmp.")); Assert.assertNotNull(debris); @@ -72,11 +66,7 @@ public void parallelMatchesSingleThreadByteForByte() throws Exception { } } - /** - * Copies the {@link #FIXTURE} into a fresh temp directory so we can build - * {@code .canno / .cseq / .csarr / .cnlcp} alongside it without polluting - * {@code src/test/resources}. - */ + /** Copies the FASTA fixture into a fresh temp dir so build artifacts don't pollute test resources. */ private static File stageFastaIntoTempDir(String prefix) throws Exception { Path tempDir = Files.createTempDirectory(prefix); File source = new File(TestBuildSAParallelBitIdentity.class.getClassLoader().getResource(FIXTURE).toURI()); @@ -86,17 +76,14 @@ private static File stageFastaIntoTempDir(String prefix) throws Exception { } /** - * Read the file and skip both the 8-byte header (size int + id int — the id - * is non-deterministic UUID-hash) and the 12-byte footer (lastModified long - * + formatId int — the lastModified differs because the test stages the - * fixture into a fresh temp dir per run, so the FASTA's mtime differs - * between the two runs). The bytes between are the actual sort output we - * want to compare bit-for-bit. + * Read the file with the 8-byte header (size + id) and 12-byte footer + * (lastModified + formatId) trimmed off. Both are non-deterministic between + * runs; the body in between is the actual sort output to compare. */ private static byte[] readBodyBytes(File f) throws IOException { byte[] all = Files.readAllBytes(f.toPath()); - int headerSize = 8; // int size + int id - int footerSize = 8 + 4; // long lastModified + int formatId + int headerSize = 8; + int footerSize = 8 + 4; Assert.assertTrue("Output file too small: " + f, all.length >= headerSize + footerSize); int bodyLen = all.length - headerSize - footerSize; byte[] body = new byte[bodyLen]; diff --git a/src/test/java/msgfplus/TestStaxMzMLParser.java b/src/test/java/msgfplus/TestStaxMzMLParser.java index 69f58ed2..a0924ba2 100644 --- a/src/test/java/msgfplus/TestStaxMzMLParser.java +++ b/src/test/java/msgfplus/TestStaxMzMLParser.java @@ -169,22 +169,16 @@ public void testGetSpectrumById() throws Exception { @Test public void testCacheReturnsDefensiveCopy() throws Exception { - // Behavioral change vs. the original parser: getSpectrumBySpecIndex - // now returns a defensive copy on every call. This is required so the - // pre-pass (MassCalibrator) cannot mutate scoring state that the main - // pass will later re-read. See `cloneSpectrum` in StaxMzMLParser. StaxMzMLParser parser = new StaxMzMLParser(getMzMLFile()); Spectrum spec1 = parser.getSpectrumBySpecIndex(2); Spectrum spec2 = parser.getSpectrumBySpecIndex(2); Assert.assertNotSame("Defensive copy should return distinct instances", spec1, spec2); - // Equivalent data: same id, same peak count, same precursor m/z Assert.assertEquals(spec1.getID(), spec2.getID()); Assert.assertEquals(spec1.size(), spec2.size()); Assert.assertEquals(spec1.getPrecursorPeak().getMz(), spec2.getPrecursorPeak().getMz(), 0.0001f); - // Mutating one copy must not affect the other (or any future read). - // Set the rank of the first peak in spec1; verify spec2 / spec3 are unaffected. + // Mutation on one copy must not leak to a future read. spec1.get(0).setRank(99); Spectrum spec3 = parser.getSpectrumBySpecIndex(2); Assert.assertNotSame(spec1, spec3); @@ -193,8 +187,7 @@ public void testCacheReturnsDefensiveCopy() throws Exception { @Test public void testMSLevelPreloadFilter() throws Exception { - // Parser constructed with [2, 2] should never decode/return MS1 spectra. - // The tiny.pwiz.mzML fixture has 3 MS1 (indices 1, 3, 4) and 1 MS2 (index 2). + // tiny.pwiz.mzML has MS1 at indices 1, 3, 4 and MS2 at index 2. StaxMzMLParser parser = new StaxMzMLParser(getMzMLFile(), 2, 2); Assert.assertNull("MS1 (index 1) must be filtered out", parser.getSpectrumBySpecIndex(1)); Assert.assertNull("MS1 (index 3) must be filtered out", parser.getSpectrumBySpecIndex(3)); @@ -205,7 +198,6 @@ public void testMSLevelPreloadFilter() throws Exception { Assert.assertEquals(10, ms2.size()); } - @Test public void testIteratorWithMSLevelFilter() throws Exception { StaxMzMLParser parser = new StaxMzMLParser(getMzMLFile());