Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 158 additions & 0 deletions parser/sqlfn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
"""Attach the SQL-name map (@sqlfn / @sqlop) to the MEOS-API catalog.

The catalog carries MEOS-C function names + C signatures, but bindings that
emit a SQL/UDF surface (MobilityDB SQL, MobilitySpark UDFs, MobilityDuck, …)
need the user-facing SQL name and operator. Both are machine-extractable from
the doxygen tag chain that already pervades the source:

MEOS-C fn --@csqlfn #MobilityDB_C()--> MobilityDB-C wrapper
MobilityDB-C wrapper --@sqlfn sqlName() / @sqlop @p <op>--> SQL name + op

So: in meos/src `@csqlfn #Wrapper()` sits above the MEOS-C function (→ MEOS-C →
Wrapper); in mobilitydb/src `@sqlfn name()` + `@sqlop @p <op>` sit above
`Datum Wrapper(PG_FUNCTION_ARGS)` (→ Wrapper → name, op). Join on Wrapper.

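Adds per function (when the chain resolves): `sqlfn`, `mdbC`, and `sqlop` (only
when an operator is tagged); plus `sqlfnAll` when more than one SQL name
resolves and `mdbCAll` when the function backs more than one wrapper.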
"""
import re
from pathlib import Path

# A @csqlfn tag carries one OR MORE #Wrapper() references — comma- or
# space-separated, and possibly continued across doxygen lines — because a single
# MEOS function can back several wrappers (the ever/always pair eDisjoint/aDisjoint
# share one ea_* function; the shift/scale/shift_scale trio share one C function).
# The tag value runs from @csqlfn up to the next doxygen tag or the comment close.
# Start of a @csqlfn tag; the \b keeps a longer tag name with the same prefix
# from matching.
_CSQLFN = re.compile(r"@csqlfn\b")
# One `#Wrapper()` reference inside the tag value; group 1 is the wrapper name.
_CSQLFN_REF = re.compile(r"#(\w+)\s*\(\)")
# End of the tag value: an `@` followed by a word character, or the close `*/`.
_CSQLFN_END = re.compile(r"@\w|\*/")
# After the doxygen close, the MEOS-C definition: an optional return-type line
# (no parens/braces/;/=), then `name(`.
_FNDEF = re.compile(r"\*/\s*\n(?:[^\n(){};=]+\n)?(\w+)\s*\(")
# `@sqlfn name()`; group 1 is the SQL function name directly after the tag.
_SQLFN = re.compile(r"@sqlfn\s+(\w+)\s*\(\)")
# `@sqlop @p <op>`; group 1 is the operator, taken as the next run of
# non-whitespace characters.
_SQLOP = re.compile(r"@sqlop\s+@p\s+(\S+)")
# `Datum Wrapper(PG_FUNCTION_ARGS`; group 1 is the MobilityDB-C wrapper name.
_DATUM = re.compile(r"Datum\s+(\w+)\s*\(\s*PG_FUNCTION_ARGS")


def _meos_to_mdb(meos_src):
    """Map each MEOS-C function name to its MobilityDB-C wrapper names.

    Scans every ``*.c`` file under ``meos_src`` for ``@csqlfn`` tags. One MEOS
    function can back more than one wrapper — the ever/always pair
    eDisjoint/aDisjoint share a single ea_* function tagged
    `@csqlfn #Edisjoint_…() #Adisjoint_…()` — so each tag may carry several
    ``#Wrapper()`` references; all of them are collected (mirrors _mdb_to_sql
    collecting every @sqlfn rather than the first).

    Args:
        meos_src: Root directory of the MEOS C sources (path or str).

    Returns:
        dict mapping the MEOS-C function name to an ordered, de-duplicated
        list of wrapper names. A tag is skipped when it carries no
        ``#Wrapper()`` reference or when the comment it sits in is not
        directly followed by a definition that ``_FNDEF`` recognises.
    """
    out = {}
    # rglob() does not promise an order; sort so that the order of each wrapper
    # list does not depend on how the filesystem enumerates files.
    for cf in sorted(Path(meos_src).rglob("*.c")):
        # Decode explicitly so the result does not vary with the locale.
        text = cf.read_text(encoding="utf-8", errors="ignore")
        for m in _CSQLFN.finditer(text):
            # The tag value ends at the next doxygen tag or the comment close.
            # Search in place instead of copying the rest of the file per tag.
            end = _CSQLFN_END.search(text, m.end())
            value = text[m.end():end.start()] if end else text[m.end():]
            wrappers = _CSQLFN_REF.findall(value)
            if not wrappers:
                continue
            # Anchor the definition match at the close of THIS comment. An
            # unanchored search would slide forward to a later comment when
            # the code after this one is not a recognised definition, and so
            # attach these wrappers to an unrelated function.
            close = text.find("*/", m.end())
            if close == -1:
                continue
            fm = _FNDEF.match(text, close)
            if not fm:
                continue
            lst = out.setdefault(fm.group(1), [])
            for w in wrappers:
                if w not in lst:
                    lst.append(w)
    return out


def _mdb_to_sql(mdb_src):
    """Map each MobilityDB-C wrapper name to its (sqlfn, sqlop) pairs.

    A shared PG wrapper can carry more than one @sqlfn (e.g. Temporal_derivative
    is exposed as both derivative() and speed()), so collect ALL of them rather
    than the first — otherwise the mapped SQL name is order-dependent.

    Args:
        mdb_src: Root directory of the MobilityDB C sources (path or str).

    Returns:
        dict mapping the wrapper name (the first ``Datum name(PG_FUNCTION_ARGS``
        found after the tag's comment) to an ordered, de-duplicated list of
        ``(sqlfn, sqlop)`` tuples; ``sqlop`` is ``None`` when no ``@sqlop`` is
        found between the ``@sqlfn`` tag and the comment close.
    """
    out = {}
    # rglob() does not promise an order; sort so the scan is reproducible.
    for cf in sorted(Path(mdb_src).rglob("*.c")):
        # Decode explicitly so the result does not vary with the locale.
        text = cf.read_text(encoding="utf-8", errors="ignore")
        for m in _SQLFN.finditer(text):
            sqlfn = m.group(1)
            # @sqlop lives in the SAME doxygen block (before the closing */).
            close = text.find("*/", m.end())
            if close != -1:
                block = text[m.start():close]
                search_from = close
            else:
                # Unterminated comment: bound the @sqlop search to 800
                # characters and look for the wrapper right after the tag.
                block = text[m.start():m.start() + 800]
                search_from = m.end()
            dm = _DATUM.search(text, search_from)
            if not dm:
                continue
            op = _SQLOP.search(block)
            entry = (sqlfn, op.group(1) if op else None)
            lst = out.setdefault(dm.group(1), [])
            if entry not in lst:
                lst.append(entry)
    return out


def attach_sqlfn_map(idl, meos_src, mdb_src):
    """Annotate catalog functions with their SQL name, operator and wrapper.

    Joins the MEOS-C -> wrapper map with the wrapper -> (sqlfn, sqlop) map and
    writes the result onto each entry of ``idl["functions"]`` in place.

    Keys written on a function whose chain resolves:
        mdbC      first wrapper listed for the function
        sqlfn     SQL name of the first resolved (sqlfn, sqlop) pair
        sqlop     operator of that pair, only when it is non-empty
        sqlfnAll  every resolved SQL name, only when there is more than one pair
        mdbCAll   every wrapper, only when there is more than one

    Args:
        idl: Catalog dict holding a ``"functions"`` list of dicts with a
            ``"name"`` key.
        meos_src: Root of the MEOS C sources.
        mdb_src: Root of the MobilityDB C sources.

    Returns:
        ``(idl, n)`` where ``n`` is the number of functions annotated.
    """
    wrappers_by_fn = _meos_to_mdb(meos_src)
    sql_by_wrapper = _mdb_to_sql(mdb_src)
    attached = 0
    for fn in idl["functions"]:
        names = wrappers_by_fn.get(fn["name"])
        if not names:
            continue
        # Ordered union of the (sqlfn, sqlop) pairs of every wrapper; the dict
        # keeps first-seen order while dropping repeats.
        resolved = list(dict.fromkeys(
            pair for wrapper in names for pair in sql_by_wrapper.get(wrapper, [])
        ))
        if not resolved:
            continue
        primary_sqlfn, primary_op = resolved[0]
        # The first wrapper stays the primary one for back-compat.
        fn["mdbC"] = names[0]
        fn["sqlfn"] = primary_sqlfn
        if primary_op:
            fn["sqlop"] = primary_op
        # Shared wrapper OR ever/always pair exposing >1 SQL name: keep them all.
        if len(resolved) > 1:
            fn["sqlfnAll"] = [sql_name for sql_name, _op in resolved]
        if len(names) > 1:
            fn["mdbCAll"] = names
        attached += 1
    return idl, attached


# MEOS-C ever/always spatial-relationship functions are named <e|a><verb>_...; their
# @csqlfn must point at the matching <E|A><verb>_... wrapper. A copy-paste @csqlfn in
# meos/src (e.g. eintersects_tgeo_geo tagged #Aintersects_tgeo_geo) silently flips the
# resolved @sqlfn from eX to aX — which then drops the real overload from the eX dispatch
# group and lets a wrong subtype backing be reached (a runtime "must be of type ..." error
# in the bindings). The parser is faithful, so guard the SOURCE here: flag any function
# whose name e/a prefix disagrees with its resolved @sqlfn e/a prefix.
_EA_FAMILY = re.compile(
    r"^(e|a)(intersects|disjoint|contains|contained|covers|coveredby|touches|"
    r"dwithin|within|equals|crosses|overlaps)_")


def lint_ea_sqlfn(idl):
    """Find ever/always functions whose resolved @sqlfn has the opposite prefix.

    A function is reported when its name matches ``_EA_FAMILY``, its ``sqlfn``
    starts with ``e`` or ``a`` followed by an upper-case letter, and the two
    leading letters differ — the sign of a mistagged @csqlfn in meos/src.

    Args:
        idl: Catalog dict holding a ``"functions"`` list of dicts with a
            ``"name"`` key and an optional ``"sqlfn"`` key.

    Returns:
        List of ``(meos_c_name, sqlfn)`` tuples, in catalog order.
    """
    mismatches = []
    for fn in idl["functions"]:
        sql_name = fn.get("sqlfn")
        family = _EA_FAMILY.match(fn["name"])
        if not sql_name or family is None:
            continue
        # Only SQL names that themselves follow the eX/aX shape are comparable.
        if not re.match(r"^[ea][A-Z]", sql_name):
            continue
        if sql_name[0] != family.group(1):
            mismatches.append((fn["name"], sql_name))
    return mismatches


def lint_sqlfn_case_collisions(idl):
    """Report @sqlfn names that are equal ignoring case but spelled differently.

    PostgreSQL folds unquoted identifiers to lower case, so such spellings
    (e.g. tDistance vs tdistance) name the SAME SQL function and the clash is
    invisible in SQL / pg_regress. The binding name, however, is taken
    case-SENSITIVELY, and case-insensitive engines (Spark SQL, …) register
    every spelling under one UDF — so one silently shadows the other. A
    canonical binding name must have exactly ONE spelling; this surfaces a
    casing straggler before it reaches a binding.

    Args:
        idl: Catalog dict holding a ``"functions"`` list; each entry may carry
            ``"sqlfn"`` and ``"sqlfnAll"``.

    Returns:
        Sorted list of ``(lower, [spelling, ...])`` with the spellings sorted,
        one item per lower-cased name that has more than one spelling.
    """
    spellings = {}
    for fn in idl["functions"]:
        candidates = [fn.get("sqlfn")]
        candidates.extend(fn.get("sqlfnAll", []))
        for name in candidates:
            if not name:
                continue
            key = name.lower()
            if key not in spellings:
                spellings[key] = set()
            spellings[key].add(name)
    collisions = [
        (key, sorted(variants))
        for key, variants in spellings.items()
        if len(variants) > 1
    ]
    collisions.sort()
    return collisions
34 changes: 34 additions & 0 deletions run.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import os
import sys
import json
from pathlib import Path

from parser.parser import parse_all_headers, merge_meta
from parser.portable import attach_portable_aliases
from parser.typerecover import recover_collapsed_types
from parser.sqlfn import attach_sqlfn_map, lint_ea_sqlfn, lint_sqlfn_case_collisions


HEADERS_DIR = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("./meos/include")
Expand Down Expand Up @@ -40,6 +42,38 @@ def main():
file=sys.stderr)
idl = attach_portable_aliases(idl, PORTABLE_PATH)

# 4. Attach the SQL-name map (@sqlfn/@sqlop) from the vendored source.
# The source root is overridable (MDB_SRC_ROOT) so a binding can point the
# @sqlfn/@ingroup extraction at the SAME pinned checkout as the headers,
# keeping the catalog reproducibly equivalent to that pin.
SRC_ROOT = Path(os.environ.get("MDB_SRC_ROOT", "./_mobilitydb"))
MEOS_SRC = SRC_ROOT / "meos" / "src"
MDB_SRC = SRC_ROOT / "mobilitydb" / "src"
if MEOS_SRC.exists() and MDB_SRC.exists():
idl, nsql = attach_sqlfn_map(idl, MEOS_SRC, MDB_SRC)
print(f"[4/4] Attached {nsql} @sqlfn SQL names", file=sys.stderr)
# Guard: a copy-paste @csqlfn in meos/src can point an ever/always function at
# the opposite-prefix wrapper (eintersects_* tagged #Aintersects_*), flipping its
# SQL name and breaking the binding overload dispatch. The parser is faithful, so
# surface the SOURCE mistag here rather than ship a wrong catalog silently.
ea_bad = lint_ea_sqlfn(idl)
if ea_bad:
print(f" ⚠ {len(ea_bad)} @csqlfn e/a-prefix mistag(es) in meos/src "
f"(fix at source — wrong @sqlfn resolved):", file=sys.stderr)
for cname, sf in ea_bad:
print(f" {cname} -> @sqlfn {sf}", file=sys.stderr)
# Guard: @sqlfn names that differ only by case (e.g. tDistance vs tdistance)
# are the SAME SQL function (PostgreSQL folds the identifier) but DISTINCT
# binding names — a case-insensitive engine (Spark SQL) registers both under
# one UDF, so one silently shadows the other. Invisible in SQL; surface the
# casing straggler here, to be fixed at the MEOS-C @sqlfn source.
case_bad = lint_sqlfn_case_collisions(idl)
if case_bad:
print(f" ⚠ {len(case_bad)} @sqlfn case-collision(s) (pick ONE canonical "
f"spelling at the MEOS-C source — binding-breaking otherwise):", file=sys.stderr)
for _lo, spellings in case_bad:
print(f" {' vs '.join(spellings)}", file=sys.stderr)

idl_path = OUTPUT_DIR / "meos-idl.json"
with open(idl_path, "w") as f:
json.dump(idl, f, indent=2)
Expand Down