From 2b55c16bc79ea9beb9d0279ce95180f5da2252bd Mon Sep 17 00:00:00 2001 From: stacknil Date: Tue, 30 Jun 2026 10:27:26 +0800 Subject: [PATCH] test(parser): add mixed coverage artifact --- CHANGELOG.md | 1 + assets/mixed_auth_parser_coverage.json | 92 ++++++++++++++++++++++ docs/parser-conformance-matrix.md | 2 +- docs/parser-contract.md | 1 + docs/parser-coverage-notes.md | 2 + docs/reviewer-path.md | 3 +- tests/test_parser.cpp | 105 +++++++++++++++++++++++++ 7 files changed, 204 insertions(+), 2 deletions(-) create mode 100644 assets/mixed_auth_parser_coverage.json diff --git a/CHANGELOG.md b/CHANGELOG.md index ec30359..221c45e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ All notable user-visible changes should be recorded here. - Added `verdict_boundary` to JSON findings and advanced the report artifact contract to `loglens.report.v2`. - Expanded parser coverage for `Accepted publickey` and selected `pam_faillock` / `pam_sss` variants. - Added a 150-line sanitized mixed auth corpus fixture covering Ubuntu / Debian-style `auth.log`, RHEL-family `secure`-style syslog, unknown lines, malformed source IPs, and blank-line handling. +- Added a reviewer-facing parser coverage JSON artifact for the mixed auth corpus. - Added compact host-level summaries for multi-host reports. - Added optional CSV export for findings and warnings when explicitly requested. 
diff --git a/assets/mixed_auth_parser_coverage.json b/assets/mixed_auth_parser_coverage.json new file mode 100644 index 0000000..0c5e638 --- /dev/null +++ b/assets/mixed_auth_parser_coverage.json @@ -0,0 +1,92 @@ +{ + "artifact": "loglens.parser_coverage_sample", + "schema_version": 1, + "fixture": "assets/mixed_auth_corpus.log", + "input_mode": "syslog_legacy", + "assume_year": 2026, + "parser_quality": { + "total_input_lines": 150, + "total_lines": 140, + "skipped_blank_lines": 10, + "parsed_lines": 90, + "unparsed_lines": 50, + "parse_success_rate": 0.6428571429, + "top_unknown_patterns": [ + {"pattern": "invalid_month_token", "count": 10}, + {"pattern": "malformed_source_ip", "count": 10}, + {"pattern": "pam_unix_session_closed", "count": 10}, + {"pattern": "program_cron", "count": 10}, + {"pattern": "sshd_connection_closed_preauth", "count": 10} + ], + "failure_categories": [ + {"category": "known_program_unknown_message", "count": 10}, + {"category": "malformed_source_ip", "count": 10}, + {"category": "unknown_program", "count": 10}, + {"category": "unknown_timestamp", "count": 10}, + {"category": "unsupported_pam_variant", "count": 10} + ] + }, + "parsed_event_count": 90, + "warning_count": 50, + "event_type_counts": [ + {"event_type": "ssh_accepted_publickey", "count": 10}, + {"event_type": "ssh_invalid_user", "count": 10}, + {"event_type": "ssh_failed_publickey", "count": 10}, + {"event_type": "pam_auth_failure", "count": 30}, + {"event_type": "sudo_command", "count": 10}, + {"event_type": "sudo_auth_failure", "count": 10}, + {"event_type": "su_auth_failure", "count": 10} + ], + "warnings": [ + {"line_number": 10, "category": "known_program_unknown_message", "reason": "unrecognized auth pattern: sshd_connection_closed_preauth"}, + {"line_number": 11, "category": "unsupported_pam_variant", "reason": "unrecognized auth pattern: pam_unix_session_closed"}, + {"line_number": 12, "category": "unknown_program", "reason": "unrecognized auth pattern: 
program_cron"}, + {"line_number": 13, "category": "malformed_source_ip", "reason": "malformed source IP"}, + {"line_number": 14, "category": "unknown_timestamp", "reason": "invalid month token"}, + {"line_number": 25, "category": "known_program_unknown_message", "reason": "unrecognized auth pattern: sshd_connection_closed_preauth"}, + {"line_number": 26, "category": "unsupported_pam_variant", "reason": "unrecognized auth pattern: pam_unix_session_closed"}, + {"line_number": 27, "category": "unknown_program", "reason": "unrecognized auth pattern: program_cron"}, + {"line_number": 28, "category": "malformed_source_ip", "reason": "malformed source IP"}, + {"line_number": 29, "category": "unknown_timestamp", "reason": "invalid month token"}, + {"line_number": 40, "category": "known_program_unknown_message", "reason": "unrecognized auth pattern: sshd_connection_closed_preauth"}, + {"line_number": 41, "category": "unsupported_pam_variant", "reason": "unrecognized auth pattern: pam_unix_session_closed"}, + {"line_number": 42, "category": "unknown_program", "reason": "unrecognized auth pattern: program_cron"}, + {"line_number": 43, "category": "malformed_source_ip", "reason": "malformed source IP"}, + {"line_number": 44, "category": "unknown_timestamp", "reason": "invalid month token"}, + {"line_number": 55, "category": "known_program_unknown_message", "reason": "unrecognized auth pattern: sshd_connection_closed_preauth"}, + {"line_number": 56, "category": "unsupported_pam_variant", "reason": "unrecognized auth pattern: pam_unix_session_closed"}, + {"line_number": 57, "category": "unknown_program", "reason": "unrecognized auth pattern: program_cron"}, + {"line_number": 58, "category": "malformed_source_ip", "reason": "malformed source IP"}, + {"line_number": 59, "category": "unknown_timestamp", "reason": "invalid month token"}, + {"line_number": 70, "category": "known_program_unknown_message", "reason": "unrecognized auth pattern: sshd_connection_closed_preauth"}, + 
{"line_number": 71, "category": "unsupported_pam_variant", "reason": "unrecognized auth pattern: pam_unix_session_closed"}, + {"line_number": 72, "category": "unknown_program", "reason": "unrecognized auth pattern: program_cron"}, + {"line_number": 73, "category": "malformed_source_ip", "reason": "malformed source IP"}, + {"line_number": 74, "category": "unknown_timestamp", "reason": "invalid month token"}, + {"line_number": 85, "category": "known_program_unknown_message", "reason": "unrecognized auth pattern: sshd_connection_closed_preauth"}, + {"line_number": 86, "category": "unsupported_pam_variant", "reason": "unrecognized auth pattern: pam_unix_session_closed"}, + {"line_number": 87, "category": "unknown_program", "reason": "unrecognized auth pattern: program_cron"}, + {"line_number": 88, "category": "malformed_source_ip", "reason": "malformed source IP"}, + {"line_number": 89, "category": "unknown_timestamp", "reason": "invalid month token"}, + {"line_number": 100, "category": "known_program_unknown_message", "reason": "unrecognized auth pattern: sshd_connection_closed_preauth"}, + {"line_number": 101, "category": "unsupported_pam_variant", "reason": "unrecognized auth pattern: pam_unix_session_closed"}, + {"line_number": 102, "category": "unknown_program", "reason": "unrecognized auth pattern: program_cron"}, + {"line_number": 103, "category": "malformed_source_ip", "reason": "malformed source IP"}, + {"line_number": 104, "category": "unknown_timestamp", "reason": "invalid month token"}, + {"line_number": 115, "category": "known_program_unknown_message", "reason": "unrecognized auth pattern: sshd_connection_closed_preauth"}, + {"line_number": 116, "category": "unsupported_pam_variant", "reason": "unrecognized auth pattern: pam_unix_session_closed"}, + {"line_number": 117, "category": "unknown_program", "reason": "unrecognized auth pattern: program_cron"}, + {"line_number": 118, "category": "malformed_source_ip", "reason": "malformed source IP"}, + 
{"line_number": 119, "category": "unknown_timestamp", "reason": "invalid month token"}, + {"line_number": 130, "category": "known_program_unknown_message", "reason": "unrecognized auth pattern: sshd_connection_closed_preauth"}, + {"line_number": 131, "category": "unsupported_pam_variant", "reason": "unrecognized auth pattern: pam_unix_session_closed"}, + {"line_number": 132, "category": "unknown_program", "reason": "unrecognized auth pattern: program_cron"}, + {"line_number": 133, "category": "malformed_source_ip", "reason": "malformed source IP"}, + {"line_number": 134, "category": "unknown_timestamp", "reason": "invalid month token"}, + {"line_number": 145, "category": "known_program_unknown_message", "reason": "unrecognized auth pattern: sshd_connection_closed_preauth"}, + {"line_number": 146, "category": "unsupported_pam_variant", "reason": "unrecognized auth pattern: pam_unix_session_closed"}, + {"line_number": 147, "category": "unknown_program", "reason": "unrecognized auth pattern: program_cron"}, + {"line_number": 148, "category": "malformed_source_ip", "reason": "malformed source IP"}, + {"line_number": 150, "category": "unknown_timestamp", "reason": "invalid month token"} + ] +} diff --git a/docs/parser-conformance-matrix.md b/docs/parser-conformance-matrix.md index 8fc9efd..87bdbba 100644 --- a/docs/parser-conformance-matrix.md +++ b/docs/parser-conformance-matrix.md @@ -132,7 +132,7 @@ coverage telemetry path. 
| [`assets/parser_auth_families_syslog.log`](../assets/parser_auth_families_syslog.log) | Selected `sshd`, `pam_unix`, `pam_faillock`, `pam_sss`, and session-opened auth-family support, plus five unsupported PAM-family telemetry buckets | | [`assets/parser_auth_families_journalctl_short_full.log`](../assets/parser_auth_families_journalctl_short_full.log) | Same auth-family event and warning shape as the syslog auth-family fixture, with journalctl timestamp parsing | | [`assets/noisy_auth_sample.log`](../assets/noisy_auth_sample.log) and [`tests/fixtures/parser_matrix/noisy_auth_expected.json`](../tests/fixtures/parser_matrix/noisy_auth_expected.json) | Noisy syslog coverage fixture with malformed lines, blank lines, unsupported auth-family evidence, irrelevant service lines, and locked parser quality counts | -| [`assets/mixed_auth_corpus.log`](../assets/mixed_auth_corpus.log) | 150-line sanitized mixed syslog corpus with Ubuntu / Debian-style `auth.log` and RHEL-family `secure` host labels, 90 parsed events, 50 parser warnings, 10 blank lines, and locked unknown-pattern and failure-category coverage | +| [`assets/mixed_auth_corpus.log`](../assets/mixed_auth_corpus.log) and [`assets/mixed_auth_parser_coverage.json`](../assets/mixed_auth_parser_coverage.json) | 150-line sanitized mixed syslog corpus with Ubuntu / Debian-style `auth.log` and RHEL-family `secure` host labels, 90 parsed events, 50 parser warnings, 10 blank lines, and locked unknown-pattern and failure-category coverage | ## Review Rule diff --git a/docs/parser-contract.md b/docs/parser-contract.md index 0299d99..f13aae4 100644 --- a/docs/parser-contract.md +++ b/docs/parser-contract.md @@ -89,6 +89,7 @@ Parsed successes and audit-only events remain reportable but do not count as bru | [`assets/parser_auth_families_syslog.log`](../assets/parser_auth_families_syslog.log) | Syslog PAM/auth-family parser coverage | | 
[`assets/parser_auth_families_journalctl_short_full.log`](../assets/parser_auth_families_journalctl_short_full.log) | Journalctl PAM/auth-family parser coverage | | [`assets/noisy_auth_sample.log`](../assets/noisy_auth_sample.log) and [`tests/fixtures/parser_matrix/noisy_auth_expected.json`](../tests/fixtures/parser_matrix/noisy_auth_expected.json) | Noisy syslog parser-coverage matrix for malformed, unsupported, blank, irrelevant, multi-host, and unusual-username input | +| [`assets/mixed_auth_corpus.log`](../assets/mixed_auth_corpus.log) and [`assets/mixed_auth_parser_coverage.json`](../assets/mixed_auth_parser_coverage.json) | 150-line mixed auth corpus plus reviewer-facing parser coverage artifact for dirty syslog input | | [`tests/test_report_contracts.cpp`](../tests/test_report_contracts.cpp) | Stable report-shape expectations for generated artifacts | ## Non-goals diff --git a/docs/parser-coverage-notes.md b/docs/parser-coverage-notes.md index 5825569..87ffb78 100644 --- a/docs/parser-coverage-notes.md +++ b/docs/parser-coverage-notes.md @@ -29,6 +29,8 @@ The locked expected coverage summary lives in [`tests/fixtures/parser_matrix/noi The corpus repeats ten small evidence batches. Each batch includes recognized `sshd`, `sudo`, `su`, `pam_unix`, `pam_faillock`, and `pam_sss` evidence; unsupported `sshd` preauth and `pam_unix` session-close telemetry; an unsupported service program; a malformed source IP; an invalid timestamp; and one blank line. +For reviewer inspection without running the test suite, [`assets/mixed_auth_parser_coverage.json`](../assets/mixed_auth_parser_coverage.json) captures the deterministic parser coverage view for this corpus: parser-quality counters, normalized event-type counts, unknown-pattern buckets, failure categories, and warning line references. 
+ Locked parser expectations: - `total_input_lines`: 150 diff --git a/docs/reviewer-path.md b/docs/reviewer-path.md index 3e345b7..f88381d 100644 --- a/docs/reviewer-path.md +++ b/docs/reviewer-path.md @@ -10,7 +10,7 @@ This path is for reviewers who want to understand LogLens quickly without readin | What log formats are supported? | [`docs/parser-contract.md`](./parser-contract.md) | Can name `syslog_legacy` and `journalctl_short_full` behavior | | What artifacts does it produce? | [`docs/report-artifacts.md`](./report-artifacts.md) and report-contract fixtures | Can inspect Markdown, JSON, and optional CSV outputs | | How do rules use evidence? | [`docs/rule-catalog.md`](./rule-catalog.md) | Can explain grouping keys, windows, thresholds, and unsupported-evidence boundaries | -| Can the parser behavior be trusted? | Parser contract, fixture matrix, and parser coverage fields | Can see known, unknown, and malformed line handling | +| Can the parser behavior be trusted? | Parser contract, fixture matrix, and [`assets/mixed_auth_parser_coverage.json`](../assets/mixed_auth_parser_coverage.json) | Can see known, unknown, and malformed line handling | | What proves the main claims? | [`docs/quality-gates.md`](./quality-gates.md) | Can map claims to tests, fixtures, docs, and repeatable commands | | How should a finding be interpreted? | [`docs/case-study-linux-auth-bruteforce.md`](./case-study-linux-auth-bruteforce.md) | Can trace raw evidence to normalized events, findings, warnings, and non-goals | | How does it behave on larger local inputs? 
| [`docs/performance-envelope.md`](./performance-envelope.md) | Can state the local 1k/10k/100k-line envelope and its caveats | @@ -43,6 +43,7 @@ Inspect: - [`tests/fixtures/report_contracts/syslog_legacy/report.json`](../tests/fixtures/report_contracts/syslog_legacy/report.json) - [`docs/report-artifacts.md`](./report-artifacts.md) - [`docs/parser-contract.md`](./parser-contract.md) +- [`assets/mixed_auth_parser_coverage.json`](../assets/mixed_auth_parser_coverage.json) - [`docs/quality-gates.md`](./quality-gates.md) - [`docs/rule-catalog.md`](./rule-catalog.md) - [`docs/case-study-linux-auth-bruteforce.md`](./case-study-linux-auth-bruteforce.md) diff --git a/tests/test_parser.cpp b/tests/test_parser.cpp index cf77c2c..4b15cad 100644 --- a/tests/test_parser.cpp +++ b/tests/test_parser.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include namespace { @@ -107,6 +108,42 @@ std::size_t failure_category_count(const loglens::ParserQualityMetrics& quality, return it == quality.failure_categories.end() ? 
0 : it->count; } +/* Tallies parsed events per loglens::EventType. Returns (type, count) entries only for the types that occurred at least once, in the fixed order of the table below; an event whose type is not in the table is not counted. NOTE(review): the template argument lists of std::vector look stripped in this view of the patch (presumably lost in text extraction) - confirm against the real patch before applying. */ std::vector> parser_event_type_counts( + const std::vector& events) { + std::vector> counts{ + {loglens::EventType::SshFailedPassword, 0}, + {loglens::EventType::SshAcceptedPassword, 0}, + {loglens::EventType::SshAcceptedPublicKey, 0}, + {loglens::EventType::SshAcceptedKeyboardInteractive, 0}, + {loglens::EventType::SshInvalidUser, 0}, + {loglens::EventType::SshFailedPublicKey, 0}, + {loglens::EventType::SshFailedKeyboardInteractive, 0}, + {loglens::EventType::SshMaxAuthTries, 0}, + {loglens::EventType::PamAuthFailure, 0}, + {loglens::EventType::SessionOpened, 0}, + {loglens::EventType::SudoCommand, 0}, + {loglens::EventType::SudoAuthFailure, 0}, + {loglens::EventType::SudoPolicyDenied, 0}, + {loglens::EventType::SuAuthFailure, 0}}; + + /* Linear scan of the table for each event: bump the first entry whose type matches, then stop. */ for (const auto& event : events) { + for (auto& [type, count] : counts) { + if (type == event.event_type) { + ++count; + break; + } + } + } + + /* Erase-remove of the zero-count entries; std::remove_if keeps the relative order of the entries it retains, so the table order survives. */ counts.erase( + std::remove_if(counts.begin(), counts.end(), [](const auto& entry) { + return entry.second == 0; + }), + counts.end()); + + return counts; +} + std::string noisy_auth_coverage_json(const loglens::ParseReport& result) { std::ostringstream output; output << "{\n" @@ -156,6 +193,70 @@ std::string noisy_auth_coverage_json(const loglens::ParseReport& result) { return output.str(); } +/* Builds the parser coverage JSON text for a ParseReport and returns it as a string. Keys are written in a fixed order: artifact, schema_version, fixture, input_mode, assume_year, parser_quality, parsed_event_count, warning_count, event_type_counts, warnings. The artifact, schema_version and fixture values are hard-coded literals. parse_success_rate is streamed with std::fixed and std::setprecision(10), i.e. ten digits after the decimal point. In every array the elements are separated by a comma plus newline and the last element is followed by a newline only; the text ends with a closing brace and a newline. total_input_lines is a helper defined outside this view. NOTE(review): the pattern and reason values are streamed between quotes as-is and no JSON escaping is visible in this function - confirm those values can never contain a quote or a backslash. */ std::string mixed_auth_coverage_json(const loglens::ParseReport& result) { + std::ostringstream output; + const auto event_counts = parser_event_type_counts(result.events); + + output << "{\n" + << " \"artifact\": \"loglens.parser_coverage_sample\",\n" + << " \"schema_version\": 1,\n" + << " \"fixture\": \"assets/mixed_auth_corpus.log\",\n" + << " \"input_mode\": \"" << loglens::to_string(result.metadata.input_mode) << "\",\n" + << " \"assume_year\": " << /* NOTE(review): dereferenced with no visible emptiness check - confirm every caller passes a report whose assume_year is set. */ *result.metadata.assume_year << ",\n" + << " \"parser_quality\": {\n" + << " \"total_input_lines\": " << total_input_lines(result) << ",\n" + << " \"total_lines\": " << result.quality.total_lines << ",\n" + << " 
\"skipped_blank_lines\": " << result.quality.skipped_blank_lines << ",\n" + << " \"parsed_lines\": " << result.quality.parsed_lines << ",\n" + << " \"unparsed_lines\": " << result.quality.unparsed_lines << ",\n" + << " \"parse_success_rate\": " << std::fixed << std::setprecision(10) + << result.quality.parse_success_rate << ",\n" + << " \"top_unknown_patterns\": [\n"; + + for (std::size_t index = 0; index < result.quality.top_unknown_patterns.size(); ++index) { + const auto& entry = result.quality.top_unknown_patterns[index]; + output << " {\"pattern\": \"" << entry.pattern << "\", \"count\": " << entry.count << "}"; + output << (index + 1 == result.quality.top_unknown_patterns.size() ? "\n" : ",\n"); + } + + output << " ],\n" + << " \"failure_categories\": [\n"; + + for (std::size_t index = 0; index < result.quality.failure_categories.size(); ++index) { + const auto& entry = result.quality.failure_categories[index]; + output << " {\"category\": \"" << loglens::to_string(entry.category) + << "\", \"count\": " << entry.count << "}"; + output << (index + 1 == result.quality.failure_categories.size() ? "\n" : ",\n"); + } + + output << " ]\n" + << " },\n" + << " \"parsed_event_count\": " << result.events.size() << ",\n" + << " \"warning_count\": " << result.warnings.size() << ",\n" + << " \"event_type_counts\": [\n"; + + for (std::size_t index = 0; index < event_counts.size(); ++index) { + const auto& [type, count] = event_counts[index]; + output << " {\"event_type\": \"" << loglens::to_string(type) << "\", \"count\": " << count << "}"; + output << (index + 1 == event_counts.size() ? 
"\n" : ",\n"); + } + + output << " ],\n" + << " \"warnings\": [\n"; + + /* One object per parser warning, in the order result.warnings holds them. */ for (std::size_t index = 0; index < result.warnings.size(); ++index) { + const auto& warning = result.warnings[index]; + output << " {\"line_number\": " << warning.line_number + << ", \"category\": \"" << loglens::to_string(warning.category) << "\"" + << ", \"reason\": \"" << warning.reason << "\"}"; + output << (index + 1 == result.warnings.size() ? "\n" : ",\n"); + } + + output << " ]\n" + << "}\n"; + return output.str(); +} + void test_invalid_user_failure() { const auto parser = make_syslog_parser(); std::string error; @@ -1186,6 +1287,10 @@ void test_mixed_auth_corpus_fixture_file() { "expected ten unknown-timestamp failures"); expect(failure_category_count(result.quality, loglens::ParserFailureCategory::UnsupportedPamVariant) == 10, "expected ten unsupported-PAM-variant failures"); + + /* Exact equality between the generated JSON text and whatever read_text_file returns for the asset path. NOTE(review): presumably sensitive to line-ending or trailing-whitespace changes in the checked-in fixture - verify how read_text_file reads the file. */ const auto actual = mixed_auth_coverage_json(result); + const auto expected = read_text_file(asset_path("mixed_auth_parser_coverage.json")); + expect(actual == expected, "expected mixed auth parser coverage artifact to match fixture"); } } // namespace