From 7f313b3d1943c80e7675769e57daf04573ffd6fa Mon Sep 17 00:00:00 2001 From: Simon Jensen Date: Tue, 23 Jun 2026 14:17:47 +0200 Subject: [PATCH 1/3] fix(glob): bound gitignore matching memory to prevent scan OOM `socket fix` and `socket scan` aborted with "FATAL ERROR: CALL_AND_RETRY_LAST ... heap out of memory" (SIGABRT) on large monorepos. globWithGitIgnore discovers every nested .gitignore and unions their patterns; the non-negated code path handed that whole set to fast-glob's native ignore option. fast-glob re-compiles and re-tests its entire ignore array inside each directory scan, so a set of tens of thousands of patterns exhausts V8 code space, which raising --max-old-space-size does not relieve. Route the high-cardinality gitignore set through a single reused `ignore` instance (which compiles each rule once and memoizes it) and hand fast-glob only the small bounded set it needs to prune directories during the walk (defaultIgnore plus the CLI ignores). Gitignored directories outside that bounded set are therefore still walked, and their entries are dropped per streamed entry by the `ignore` instance. The negated-pattern path already worked this way; this unifies both paths and removes the asymmetry that left the common case crashing. Add a regression test that builds a 100k-pattern nested-.gitignore tree and asserts the walk completes with the correct manifests, and correct a comment in getPackageFilesForScan that overstated what the streaming filter prevents. 
--- src/utils/glob-oom.test.mts | 72 +++++++++++++++++++++++++++++++++++++ src/utils/glob.mts | 59 +++++++++++++++--------------- src/utils/path-resolve.mts | 9 +++-- 3 files changed, 107 insertions(+), 33 deletions(-) create mode 100644 src/utils/glob-oom.test.mts diff --git a/src/utils/glob-oom.test.mts b/src/utils/glob-oom.test.mts new file mode 100644 index 000000000..9082a32e0 --- /dev/null +++ b/src/utils/glob-oom.test.mts @@ -0,0 +1,72 @@ +import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs' +import { tmpdir } from 'node:os' +import path from 'node:path' + +import { describe, expect, it } from 'vitest' + +import { normalizePath } from '@socketsecurity/registry/lib/path' + +import { globWithGitIgnore } from './glob.mts' + +// Defined at module scope to satisfy linting rules. +function filterJsonFiles(filepath: string): boolean { + return filepath.endsWith('.json') +} + +// This suite lives in its own file, with no mock-fs node_modules preload, so the +// large ignore set it builds is the only significant allocation in the worker. +describe('globWithGitIgnore() large monorepo memory', () => { + // Regression: `socket fix` / `socket scan` aborted with + // `FATAL ERROR: CALL_AND_RETRY_LAST … heap out of memory` (SIGABRT) on large + // monorepos. globWithGitIgnore discovers every nested .gitignore and unions + // their patterns; handing that whole set to fast-glob's `ignore` option made + // fast-glob re-compile and re-test the entire array inside every directory + // scan, so tens of thousands of patterns exhausted V8 code space. Routing the + // gitignore set through a single reused `ignore` instance bounds the cost. The + // flat union built below is large enough to crash the old path; the walk must + // instead complete and return the right manifests. Uses the real filesystem + // because mock-fs would hold every pattern in memory and add its own overhead. 
+ it('does not exhaust memory on a huge nested-.gitignore pattern set', async () => { + const realTmp = mkdtempSync(path.join(tmpdir(), 'socket-glob-oom-')) + try { + // 100 packages * 1000 lines = 100k distinct patterns. The pre-fix code + // (whole set handed to fast-glob, re-compiled per directory scan) exhausts + // a constrained test-worker heap at this count, while the reused `ignore` + // instance stays well within it. + const pkgCount = 100 + const linesPerPkg = 1_000 + // Each line anchors to a distinct local generated dir, so the flat union + // across packages is pkgCount * linesPerPkg distinct patterns. + const lines: string[] = [] + for (let l = 0; l < linesPerPkg; l += 1) { + lines.push(`generated_${l}/`) + } + const gitignoreBody = `${lines.join('\n')}\n` + // The root manifest and one manifest per package must be found. + writeFileSync(path.join(realTmp, 'package.json'), '{}') + const expected = [normalizePath(path.join(realTmp, 'package.json'))] + for (let d = 0; d < pkgCount; d += 1) { + const pkgDir = path.join(realTmp, 'packages', `pkg-${d}`) + const ignoredDir = path.join(pkgDir, 'generated_0') + mkdirSync(ignoredDir, { recursive: true }) + writeFileSync(path.join(pkgDir, '.gitignore'), gitignoreBody) + writeFileSync(path.join(pkgDir, 'package.json'), '{}') + // A manifest inside the package's own ignored generated dir must be + // excluded, proving the gitignore set is still honored. + writeFileSync(path.join(ignoredDir, 'package.json'), '{}') + expected.push(normalizePath(path.join(pkgDir, 'package.json'))) + } + + // Mirror the production call shape: a manifest filter forces the streaming + // branch that getPackageFilesForScan always takes. 
+ const results = await globWithGitIgnore(['**/*'], { + cwd: realTmp, + filter: filterJsonFiles, + }) + + expect(results.map(normalizePath).sort()).toEqual(expected.sort()) + } finally { + rmSync(realTmp, { force: true, recursive: true }) + } + }, 60_000) +}) diff --git a/src/utils/glob.mts b/src/utils/glob.mts index e24cf54c7..ee7916a6e 100644 --- a/src/utils/glob.mts +++ b/src/utils/glob.mts @@ -290,22 +290,32 @@ export async function globWithGitIgnore( } } - let hasNegatedPattern = false - for (const p of ignores) { - if (p.charCodeAt(0) === 33 /*'!'*/) { - hasNegatedPattern = true - break - } - } + // Match every gitignore-derived pattern through a single reused `ignore` + // instance instead of handing the whole set to fast-glob's native `ignore` + // option. fast-glob re-compiles and re-tests its entire ignore array inside + // each directory scan (`node::fs::AfterScanDir`), so a large monorepo whose + // nested `.gitignore` files union to tens of thousands of patterns aborts with + // `CALL_AND_RETRY_LAST … heap out of memory`. Raising `--max-old-space-size` + // does not reliably help: much of the cost is regex executable code in V8 code + // space rather than the data heap. The `ignore` package compiles each rule + // once and memoizes it, so the cost scales with the pattern count rather than + // being multiplied by the number of directories walked. fast-glob + // keeps only the small, bounded set it needs to PRUNE directories during the + // walk (`defaultIgnore`, which already excludes node_modules and .git, plus + // the anchored CLI minimatch ignores); the high-cardinality gitignore set is + // applied per streamed entry by `ig` below. The `ignore` package also honors + // negated re-includes, which fast-glob, globby, and tinyglobby cannot express. + // The negated-pattern path already worked this way; routing both cases through + // it removes the asymmetry that left the common, non-negated case crashing on + // large repos. 
+ const ig = ignore().add([...ignores]) const globOptions = { __proto__: null, absolute: true, cwd, dot: true, - ignore: hasNegatedPattern - ? [...defaultIgnore, ...cliMinimatchIgnores] - : [...ignores, ...cliMinimatchIgnores].map(stripTrailingSlash), + ignore: [...defaultIgnore, ...cliMinimatchIgnores], ...additionalOptions, // Skip directories the running user cannot read rather than aborting the // whole walk on the first `EACCES` (see the .gitignore discovery walk @@ -316,33 +326,22 @@ export async function globWithGitIgnore( suppressErrors: true, } as GlobOptions - // When no filter is provided and no negated patterns exist, use the fast path. - if (!hasNegatedPattern && !filter) { - return await fastGlob.glob(patterns as string[], globOptions) - } - // Add support for negated "ignore" patterns which many globbing libraries, - // including 'fast-glob', 'globby', and 'tinyglobby', lack support for. - // Use streaming to avoid unbounded memory accumulation. - // This is critical for large monorepos with 100k+ files. + // Stream results so memory stays bounded on large monorepos with 100k+ files: + // `ig` applies the gitignore matching per entry and the optional caller filter + // (e.g. manifest files only) drops non-matches before they accumulate, instead + // of collecting every path and filtering afterward. const results: string[] = [] - const ig = hasNegatedPattern ? ignore().add([...ignores]) : null const stream = fastGlob.globStream( patterns as string[], globOptions, ) as AsyncIterable for await (const p of stream) { - // Check gitignore patterns with negation support. - if (ig) { - // Note: the input files must be INSIDE the cwd. If you get strange looking - // relative path errors here, most likely your path is outside the given cwd. - const relPath = globOptions.absolute ? path.relative(cwd, p) : p - if (ig.ignores(relPath)) { - continue - } + // Note: the input files must be INSIDE the cwd. 
If you get strange looking + // relative path errors here, most likely your path is outside the given cwd. + const relPath = globOptions.absolute ? path.relative(cwd, p) : p + if (ig.ignores(relPath)) { + continue } - // Apply the optional filter to reduce memory usage. - // When scanning large monorepos, this filters early (e.g., to manifest files only) - // instead of accumulating all 100k+ files and filtering later. if (filter && !filter(p)) { continue } diff --git a/src/utils/path-resolve.mts b/src/utils/path-resolve.mts index 247d81ede..85a2491f8 100644 --- a/src/utils/path-resolve.mts +++ b/src/utils/path-resolve.mts @@ -144,9 +144,12 @@ export async function getPackageFilesForScan( ...options, } as PackageFilesForScanOptions - // Apply the supported files filter during streaming to avoid accumulating - // all files in memory. This is critical for large monorepos with 100k+ files - // where accumulating all paths before filtering causes OOM errors. + // Apply the supported files filter during streaming so globWithGitIgnore drops + // non-manifest paths as they are walked instead of collecting every path first. + // This bounds RESULT-path memory on large monorepos with 100k+ files. Note it + // does NOT bound the gitignore ignore-pattern memory: that OOM (regex compile + // exhausting V8 code space) is handled inside globWithGitIgnore by matching the + // gitignore set through a single reused `ignore` instance. 
const filter = createSupportedFilesFilter(supportedFiles) const normalizedInputPaths = inputPaths.map(p => From 775e0f2cb885207e2d7455a8ff92e278b5a60c58 Mon Sep 17 00:00:00 2001 From: Simon Jensen Date: Tue, 23 Jun 2026 14:50:06 +0200 Subject: [PATCH 2/3] fix(glob): keep gitignore matching case-sensitive and POSIX-normalized Routing the non-negated path through the ignore package introduced two parity gaps versus fast-glob's native ignore matching: - The ignore package defaults to case-insensitive matching, while fast-glob (caseSensitiveMatch defaults to true) and git (unless core.ignoreCase is enabled) match case-sensitively. Build the matcher with ignorecase derived from caseSensitiveMatch so a `dist/` entry no longer also ignores a differently-cased `Dist/` sibling. - path.relative yields backslash-separated paths on Windows, which never match the forward-slash-anchored patterns. Normalize the relative path with normalizePath before ig.ignores(), matching how the patterns are anchored. Add a case-sensitivity regression test (dist/ vs Dist/). 
--- src/utils/glob.mts | 16 ++++++++++++++-- src/utils/glob.test.mts | 22 ++++++++++++++++++++++ 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/src/utils/glob.mts b/src/utils/glob.mts index ee7916a6e..e5c1c3402 100644 --- a/src/utils/glob.mts +++ b/src/utils/glob.mts @@ -8,6 +8,7 @@ import { parse as yamlParse } from 'yaml' import { isDirSync, safeReadFile } from '@socketsecurity/registry/lib/fs' import { defaultIgnore } from '@socketsecurity/registry/lib/globs' import { readPackageJson } from '@socketsecurity/registry/lib/packages' +import { normalizePath } from '@socketsecurity/registry/lib/path' import { transform } from '@socketsecurity/registry/lib/streams' import { isNonEmptyString } from '@socketsecurity/registry/lib/strings' @@ -308,7 +309,13 @@ export async function globWithGitIgnore( // The negated-pattern path already worked this way; routing both cases through // it removes the asymmetry that left the common, non-negated case crashing on // large repos. - const ig = ignore().add([...ignores]) + // Match fast-glob's case sensitivity (its `caseSensitiveMatch` defaults to + // true) so routing the non-negated path through the `ignore` package does not + // silently start matching case-insensitively, which is the `ignore` package's + // own default. + const ig = ignore({ + ignorecase: additionalOptions.caseSensitiveMatch === false, + }).add([...ignores]) const globOptions = { __proto__: null, @@ -338,7 +345,12 @@ export async function globWithGitIgnore( for await (const p of stream) { // Note: the input files must be INSIDE the cwd. If you get strange looking // relative path errors here, most likely your path is outside the given cwd. - const relPath = globOptions.absolute ? path.relative(cwd, p) : p + // Normalize to POSIX separators: the `ignore` patterns are forward-slash + // anchored (see ignoreFileLinesToGlobPatterns), so a Windows backslash path + // from path.relative would never match them. 
+ const relPath = normalizePath( + globOptions.absolute ? path.relative(cwd, p) : p, + ) if (ig.ignores(relPath)) { continue } diff --git a/src/utils/glob.test.mts b/src/utils/glob.test.mts index f403306cd..e381c5e02 100644 --- a/src/utils/glob.test.mts +++ b/src/utils/glob.test.mts @@ -210,6 +210,28 @@ describe('glob utilities', () => { ]) }) + it('matches gitignore entries case-sensitively, like fast-glob', async () => { + // The `ignore` package defaults to case-insensitive matching, but + // fast-glob (caseSensitiveMatch defaults to true) and git treat the + // ignore set case-sensitively. A `dist/` entry must ignore `dist/` but + // leave a differently-cased `Dist/` sibling alone. + mockTestFs({ + [`${mockFixturePath}/.gitignore`]: 'dist/\n', + [`${mockFixturePath}/package.json`]: '{}', + [`${mockFixturePath}/dist/a.json`]: '{}', + [`${mockFixturePath}/Dist/b.json`]: '{}', + }) + + const results = await globWithGitIgnore(['**/*.json'], { + cwd: mockFixturePath, + }) + + expect(results.map(normalizePath).sort()).toEqual([ + `${mockFixturePath}/Dist/b.json`, + `${mockFixturePath}/package.json`, + ]) + }) + it('keeps additionalIgnores anchored even when a gitignore negation forces the streaming path', async () => { // A bare `tests` pattern means "the entry `tests` at the scan root". 
// The streaming path uses the `ignore` package for gitignore-translated From 90890ee5d4a4169eb32b1ea70be4a518bc9f284c Mon Sep 17 00:00:00 2001 From: Simon Jensen Date: Tue, 23 Jun 2026 15:34:55 +0200 Subject: [PATCH 3/3] style(glob): trim comments to at most four lines --- src/utils/glob-oom.test.mts | 14 ++++---------- src/utils/glob.mts | 33 +++++++-------------------------- src/utils/path-resolve.mts | 8 +++----- 3 files changed, 14 insertions(+), 41 deletions(-) diff --git a/src/utils/glob-oom.test.mts b/src/utils/glob-oom.test.mts index 9082a32e0..f1fd8f11f 100644 --- a/src/utils/glob-oom.test.mts +++ b/src/utils/glob-oom.test.mts @@ -16,16 +16,10 @@ function filterJsonFiles(filepath: string): boolean { // This suite lives in its own file, with no mock-fs node_modules preload, so the // large ignore set it builds is the only significant allocation in the worker. describe('globWithGitIgnore() large monorepo memory', () => { - // Regression: `socket fix` / `socket scan` aborted with - // `FATAL ERROR: CALL_AND_RETRY_LAST … heap out of memory` (SIGABRT) on large - // monorepos. globWithGitIgnore discovers every nested .gitignore and unions - // their patterns; handing that whole set to fast-glob's `ignore` option made - // fast-glob re-compile and re-test the entire array inside every directory - // scan, so tens of thousands of patterns exhausted V8 code space. Routing the - // gitignore set through a single reused `ignore` instance bounds the cost. The - // flat union built below is large enough to crash the old path; the walk must - // instead complete and return the right manifests. Uses the real filesystem - // because mock-fs would hold every pattern in memory and add its own overhead. + // Regression: scanning a large monorepo OOM'd because the whole unioned + // gitignore set was handed to fast-glob, which recompiled it per directory + // scan. The 100k-pattern tree below crashes the pre-fix path; the walk must + // complete with the right manifests. 
Real fs (mock-fs is too heavy here). it('does not exhaust memory on a huge nested-.gitignore pattern set', async () => { const realTmp = mkdtempSync(path.join(tmpdir(), 'socket-glob-oom-')) try { diff --git a/src/utils/glob.mts b/src/utils/glob.mts index e5c1c3402..a31c03c5d 100644 --- a/src/utils/glob.mts +++ b/src/utils/glob.mts @@ -291,28 +291,10 @@ export async function globWithGitIgnore( } } - // Match every gitignore-derived pattern through a single reused `ignore` - // instance instead of handing the whole set to fast-glob's native `ignore` - // option. fast-glob re-compiles and re-tests its entire ignore array inside - // each directory scan (`node::fs::AfterScanDir`), so a large monorepo whose - // nested `.gitignore` files union to tens of thousands of patterns aborts with - // `CALL_AND_RETRY_LAST … heap out of memory`. Raising `--max-old-space-size` - // does not reliably help: much of the cost is regex executable code in V8 code - // space rather than the data heap. The `ignore` package compiles each rule - // once and memoizes it, so the cost scales with the pattern count rather than - // being multiplied by the number of directories walked. fast-glob - // keeps only the small, bounded set it needs to PRUNE directories during the - // walk (`defaultIgnore`, which already excludes node_modules and .git, plus - // the anchored CLI minimatch ignores); the high-cardinality gitignore set is - // applied per streamed entry by `ig` below. The `ignore` package also honors - // negated re-includes, which fast-glob, globby, and tinyglobby cannot express. - // The negated-pattern path already worked this way; routing both cases through - // it removes the asymmetry that left the common, non-negated case crashing on - // large repos. 
- // Match fast-glob's case sensitivity (its `caseSensitiveMatch` defaults to - // true) so routing the non-negated path through the `ignore` package does not - // silently start matching case-insensitively, which is the `ignore` package's - // own default. + // Match the high-cardinality gitignore set through one reused `ignore` + // instance, not fast-glob's `ignore` (which recompiles its whole array per + // directory scan and OOMs on tens of thousands of patterns); fast-glob keeps + // only the bounded prune set. `ignorecase` tracks fast-glob's default. const ig = ignore({ ignorecase: additionalOptions.caseSensitiveMatch === false, }).add([...ignores]) @@ -343,11 +325,10 @@ export async function globWithGitIgnore( globOptions, ) as AsyncIterable for await (const p of stream) { - // Note: the input files must be INSIDE the cwd. If you get strange looking - // relative path errors here, most likely your path is outside the given cwd. // Normalize to POSIX separators: the `ignore` patterns are forward-slash - // anchored (see ignoreFileLinesToGlobPatterns), so a Windows backslash path - // from path.relative would never match them. + // anchored (ignoreFileLinesToGlobPatterns), so a Windows backslash path from + // path.relative would never match. Input must be inside cwd, else + // path.relative returns an odd `..`-prefixed relative path. const relPath = normalizePath( globOptions.absolute ? path.relative(cwd, p) : p, ) diff --git a/src/utils/path-resolve.mts b/src/utils/path-resolve.mts index 85a2491f8..d2c1791ac 100644 --- a/src/utils/path-resolve.mts +++ b/src/utils/path-resolve.mts @@ -145,11 +145,9 @@ export async function getPackageFilesForScan( } as PackageFilesForScanOptions // Apply the supported files filter during streaming so globWithGitIgnore drops - // non-manifest paths as they are walked instead of collecting every path first. - // This bounds RESULT-path memory on large monorepos with 100k+ files. 
Note it - // does NOT bound the gitignore ignore-pattern memory: that OOM (regex compile - // exhausting V8 code space) is handled inside globWithGitIgnore by matching the - // gitignore set through a single reused `ignore` instance. + // non-manifest paths as they are walked. This bounds RESULT-path memory; it + // does NOT bound the gitignore ignore-pattern memory (that OOM is handled + // inside globWithGitIgnore via a single reused `ignore` instance). const filter = createSupportedFilesFilter(supportedFiles) const normalizedInputPaths = inputPaths.map(p =>