Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ OpenKB settings are initialized by `openkb init` and stored in `.openkb/config.y
model: gpt-5.4 # LLM model (any LiteLLM-supported provider)
language: en # Wiki output language
pageindex_threshold: 20 # PDF pages threshold for PageIndex
file_processing_jobs: 2 # Files to prepare concurrently during `openkb add <dir>`
```

Model names use `provider/model` LiteLLM [format](https://docs.litellm.ai/docs/providers) (OpenAI models can omit the prefix):
Expand All @@ -372,6 +373,8 @@ Model names use `provider/model` LiteLLM [format](https://docs.litellm.ai/docs/p
<summary><i>Advanced options (<code>entity_types</code>, <code>extra_headers</code>, OAuth):</i></summary>
<br>

`file_processing_jobs` (default `2`): number of files prepared concurrently during `openkb add <dir>`. Only the preparation stage is parallelized (hashing, duplicate prefiltering, raw/source staging, conversion); live-KB mutation stays serialized under the mutation lock, so raising this value helps mainly when conversion is the bottleneck.

`entity_types` (optional): a YAML list overriding the entity-type vocabulary used for entity pages; omit it to use the default `person`, `organization`, `place`, `product`, `work`, `event`, `other`.

`extra_headers` (optional): a YAML mapping of extra HTTP headers sent with every LLM request (forwarded to LiteLLM's `extra_headers`). Useful for providers that expect custom headers, e.g. GitHub Copilot IDE-auth headers:
Expand Down
4 changes: 4 additions & 0 deletions config.yaml.example
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
model: gpt-5.4 # LLM model (any LiteLLM-supported provider)
language: en # Wiki output language
pageindex_threshold: 20 # PDF pages threshold for PageIndex
file_processing_jobs: 2 # Number of files to prepare concurrently during `openkb add <dir>`
# Note: only preparation (hashing, duplicate prefiltering, staging, conversion)
# is parallelized. Live KB publishing, PageIndex indexing, LLM compilation,
# registry updates, and log writes remain serialized under the KB mutation lock.

# Optional: extra HTTP headers sent with every LLM request (forwarded to
# LiteLLM's extra_headers). Some providers need these — e.g. GitHub Copilot
Expand Down
29 changes: 15 additions & 14 deletions openkb/agent/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from openkb import frontmatter
from openkb.config import DEFAULT_ENTITY_TYPES, get_extra_headers, resolve_entity_types
from openkb.lint import list_existing_wiki_targets, strip_ghost_wikilinks
from openkb.locks import atomic_write_text
from openkb.schema import INDEX_SEED, get_agents_md

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -768,7 +769,7 @@ def _write_summary(wiki_dir: Path, doc_name: str, summary: str,
fm_lines.append(f"doc_type: {doc_type}")
fm_lines.append(_yaml_kv_line("full_text", f"sources/{doc_name}.{ext}"))
fm_block = "---\n" + "\n".join(fm_lines) + "\n---\n\n"
(summaries_dir / f"{doc_name}.md").write_text(fm_block + summary, encoding="utf-8")
atomic_write_text(summaries_dir / f"{doc_name}.md", fm_block + summary)


_SAFE_NAME_RE = re.compile(r'[^\w\-]')
Expand Down Expand Up @@ -828,7 +829,7 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
if brief:
fm_lines.append(_yaml_kv_line("description", brief))
existing = frontmatter.block(fm_lines) + clean
path.write_text(existing, encoding="utf-8")
atomic_write_text(path, existing)
return
# Guarantee type + refresh description on update; remove legacy brief:.
ex_parts2 = frontmatter.split(existing)
Expand All @@ -840,7 +841,7 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
# Drop legacy brief: lines (migrated to description:).
fm_block = frontmatter.drop_line(fm_block, "brief")
existing = fm_block + body
path.write_text(existing, encoding="utf-8")
atomic_write_text(path, existing)
else:
clean_parts = frontmatter.split(content)
if clean_parts is not None:
Expand All @@ -852,7 +853,7 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
if brief:
fm_lines.append(_yaml_kv_line("description", brief))
fm_block = "---\n" + "\n".join(fm_lines) + "\n---\n\n"
path.write_text(fm_block + content, encoding="utf-8")
atomic_write_text(path, fm_block + content)


def _write_entity(
Expand Down Expand Up @@ -916,10 +917,10 @@ def _build_entity_frontmatter(sources: list[str]) -> str:
break
merged = [source_file] + [s for s in recovered if s != source_file]
existing = _build_entity_frontmatter(merged) + clean
path.write_text(existing, encoding="utf-8")
atomic_write_text(path, existing)
return

path.write_text(_build_entity_frontmatter([source_file]) + clean, encoding="utf-8")
atomic_write_text(path, _build_entity_frontmatter([source_file]) + clean)


_set_fm_line = frontmatter.set_line
Expand Down Expand Up @@ -1024,7 +1025,7 @@ def _add_related_link(
text = _prepend_source_to_frontmatter(text, source_file)

text += f"\n\nSee also: {link}"
path.write_text(text, encoding="utf-8")
atomic_write_text(path, text)
return True


Expand All @@ -1051,7 +1052,7 @@ def _backlink_summary_pages(
_ensure_h2_section(lines, section, quiet=True)
for slug in reversed(missing):
_insert_section_entry(lines, section, f"- [[{page_dir}/{slug}]]")
summary_path.write_text("\n".join(lines), encoding="utf-8")
atomic_write_text(summary_path, "\n".join(lines))


def _backlink_pages(
Expand All @@ -1072,7 +1073,7 @@ def _backlink_pages(
lines = text.split("\n")
_ensure_h2_section(lines, "## Related Documents", quiet=True)
_insert_section_entry(lines, "## Related Documents", f"- {link}")
path.write_text("\n".join(lines), encoding="utf-8")
atomic_write_text(path, "\n".join(lines))


def _backlink_summary(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) -> None:
Expand Down Expand Up @@ -1178,7 +1179,7 @@ def _remove_doc_from_pages(
path.unlink()
deleted.append(path.stem)
elif new_text != text:
path.write_text(new_text, encoding="utf-8")
atomic_write_text(path, new_text)
modified.append(path.stem)

return {"modified": modified, "deleted": deleted}
Expand Down Expand Up @@ -1274,7 +1275,7 @@ def remove_doc_from_index(wiki_dir: Path, doc_name: str, concept_slugs_deleted:
while _remove_section_entry(lines, "## Entities", entity_link):
pass

index_path.write_text("\n".join(lines), encoding="utf-8")
atomic_write_text(index_path, "\n".join(lines))


def _update_index(
Expand All @@ -1298,7 +1299,7 @@ def _update_index(

index_path = wiki_dir / "index.md"
if not index_path.exists():
index_path.write_text(INDEX_SEED, encoding="utf-8")
atomic_write_text(index_path, INDEX_SEED)

lines = index_path.read_text(encoding="utf-8").split("\n")

Expand Down Expand Up @@ -1344,7 +1345,7 @@ def _update_index(
else:
_insert_section_entry(lines, "## Entities", entry)

index_path.write_text("\n".join(lines), encoding="utf-8")
atomic_write_text(index_path, "\n".join(lines))


# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -2018,7 +2019,7 @@ async def compile_long_doc(
updated = fm_block + body
if updated != summary_content:
summary_content = updated
summary_path.write_text(summary_content, encoding="utf-8")
atomic_write_text(summary_path, summary_content)

# Base context A. cache_control marker on the doc message creates a
# cache breakpoint covering (system + doc) for every concept call.
Expand Down
Loading