diff --git a/CHANGELOG.md b/CHANGELOG.md index 618b96dad7..bca5a38435 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ # Changelog +## Unreleased + +### Features + +- Add the `data_collection` option, a structured configuration that supersedes `send_default_pii` for controlling what data integrations collect automatically (user identity, cookies, HTTP headers, query params, HTTP bodies, generative AI inputs/outputs, stack frame variables, source context). See the [Data Collection spec](https://develop.sentry.dev/sdk/foundations/client/data-collection/). + - Adds `sentry_sdk.DataCollection`, `KeyValueCollectionBehavior`, `HttpHeadersCollection`, and `GenAICollection`. + - When `data_collection` is not set, behavior is derived from `send_default_pii` (now deprecated), so upgrading without configuring `data_collection` changes nothing. + - `frame_context_lines` is now configurable (previously hardcoded to 5); AI integrations' `include_prompts` becomes a per-integration override of `data_collection.gen_ai`. + ## 2.63.0 ### Bug Fixes 🐛 diff --git a/README.md b/README.md index 7a7bf8b44f..060e48e314 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,11 @@ sentry_sdk.init( # Set traces_sample_rate to 1.0 to capture 100% # of traces for performance monitoring. traces_sample_rate=1.0, + + # To disable sending user data and HTTP request/response bodies, uncomment + # the line below. 
For more info visit: + # https://docs.sentry.io/platforms/python/configuration/options/#data_collection + # data_collection=sentry_sdk.DataCollection(user_info=False, http_bodies=[]), ) ``` diff --git a/sentry_sdk/__init__.py b/sentry_sdk/__init__.py index 8ce8d739c9..ec27d153a3 100644 --- a/sentry_sdk/__init__.py +++ b/sentry_sdk/__init__.py @@ -2,6 +2,12 @@ from sentry_sdk.scope import Scope # isort: skip from sentry_sdk.client import Client # isort: skip +from sentry_sdk.data_collection import ( # isort: skip + DataCollection, + GenAICollection, + HttpHeadersCollection, + KeyValueCollectionBehavior, +) from sentry_sdk.consts import VERSION from sentry_sdk.transport import HttpTransport, Transport @@ -11,6 +17,10 @@ "Hub", "Scope", "Client", + "DataCollection", + "GenAICollection", + "HttpHeadersCollection", + "KeyValueCollectionBehavior", "Transport", "HttpTransport", "VERSION", diff --git a/sentry_sdk/client.py b/sentry_sdk/client.py index 81fb8c385c..32b1943d64 100644 --- a/sentry_sdk/client.py +++ b/sentry_sdk/client.py @@ -23,6 +23,12 @@ VERSION, ClientConstructor, ) +from sentry_sdk.data_collection import ( + OFF_DATA_COLLECTION, + DataCollection, + _map_from_send_default_pii, + resolve_data_collection, +) from sentry_sdk.envelope import Envelope, Item, PayloadRef from sentry_sdk.integrations import _DEFAULT_INTEGRATIONS, setup_integrations from sentry_sdk.integrations.dedupe import DedupeIntegration @@ -345,11 +351,11 @@ def _get_options(*args: "Optional[str]", **kwargs: "Any") -> "Dict[str, Any]": if rv["enable_tracing"] is True and rv["traces_sample_rate"] is None: rv["traces_sample_rate"] = 1.0 + rv["data_collection"] = resolve_data_collection(rv) + if rv["event_scrubber"] is None: rv["event_scrubber"] = EventScrubber( - send_default_pii=( - False if rv["send_default_pii"] is None else rv["send_default_pii"] - ) + send_default_pii=rv["data_collection"].user_info ) if rv["socket_options"] and not isinstance(rv["socket_options"], list): @@ -425,6 +431,23 
@@ def parsed_dsn(self) -> "Optional[Dsn]": def should_send_default_pii(self) -> bool: return False + @property + def data_collection(self) -> "DataCollection": + return OFF_DATA_COLLECTION + + def should_collect_user_info(self) -> bool: + return False + + def should_collect_gen_ai_inputs( + self, include_prompts: "Optional[bool]" = None + ) -> bool: + return False + + def should_collect_gen_ai_outputs( + self, include_prompts: "Optional[bool]" = None + ) -> bool: + return False + def is_active(self) -> bool: """ .. versionadded:: 2.0.0 @@ -614,6 +637,17 @@ def _record_lost_event( self.options["error_sampler"] = sample_all self.options["traces_sampler"] = sample_all self.options["profiles_sampler"] = sample_all + # data_collection was resolved in _get_options() before this + # spotlight override flipped send_default_pii on. Re-derive it so + # the should_collect_* accessors agree with should_send_default_pii() + # in DSN-less spotlight mode (only when the user did not set + # data_collection explicitly). + if not self.options["data_collection"].explicit: + self.options["data_collection"] = _map_from_send_default_pii( + True, + self.options["include_local_variables"] is not False, + self.options["include_source_context"] is not False, + ) self.session_flusher = SessionFlusher(capture_func=_capture_envelope) @@ -724,6 +758,59 @@ def should_send_default_pii(self) -> bool: """ return self.options.get("send_default_pii") or False + @property + def data_collection(self) -> "DataCollection": + """ + Returns the resolved :class:`~sentry_sdk.data_collection.DataCollection` + config for this client. + """ + dc = self.options.get("data_collection") + return dc if dc is not None else OFF_DATA_COLLECTION + + def should_collect_user_info(self) -> bool: + """ + Returns whether the SDK should automatically populate ``user.*`` fields + (id, email, username, ip_address) from instrumentation. 
+ """ + return bool(self.data_collection.user_info) + + def should_collect_gen_ai_inputs( + self, include_prompts: "Optional[bool]" = None + ) -> bool: + """ + Returns whether the SDK should collect generative AI input content. + + ``include_prompts`` is the integration-level override (if set, it takes + precedence over the global ``data_collection.gen_ai.inputs`` setting). + """ + return self._should_collect_gen_ai_content("inputs", include_prompts) + + def should_collect_gen_ai_outputs( + self, include_prompts: "Optional[bool]" = None + ) -> bool: + """ + Returns whether the SDK should collect generative AI output content. + + ``include_prompts`` is the integration-level override (if set, it takes + precedence over the global ``data_collection.gen_ai.outputs`` setting). + """ + return self._should_collect_gen_ai_content("outputs", include_prompts) + + def _should_collect_gen_ai_content( + self, direction: str, include_prompts: "Optional[bool]" + ) -> bool: + dc = self.data_collection + if dc.explicit: + # Integration-level override wins over the global gen_ai setting. + if include_prompts is not None: + return include_prompts + return bool(getattr(dc.gen_ai, direction)) + # Legacy (data_collection not set): preserve the historical gate + # `should_send_default_pii() and integration.include_prompts`. + # `include_prompts is None` means "no integration-level override", which + # falls back to the legacy default of True (collect when PII is on). 
+ return self.should_send_default_pii() and (include_prompts is not False) + @property def dsn(self) -> "Optional[str]": """Returns the configured DSN as string.""" diff --git a/sentry_sdk/consts.py b/sentry_sdk/consts.py index b85b179223..276ac70f23 100644 --- a/sentry_sdk/consts.py +++ b/sentry_sdk/consts.py @@ -56,6 +56,7 @@ class CompressionAlgo(Enum): TracesSampler, TransactionProcessor, ) + from sentry_sdk.data_collection import DataCollection # Experiments are feature flags to enable and disable certain unstable SDK # functionality. Changing them from the defaults (`None`) in production @@ -1272,6 +1273,7 @@ def __init__( transport_queue_size: int = DEFAULT_QUEUE_SIZE, sample_rate: float = 1.0, send_default_pii: "Optional[bool]" = None, + data_collection: "Optional[Union[DataCollection, Dict[str, Any]]]" = None, http_proxy: "Optional[str]" = None, https_proxy: "Optional[str]" = None, ignore_errors: "Sequence[Union[type, str]]" = [], # noqa: B006 @@ -1426,6 +1428,26 @@ def __init__( If you enable this option, be sure to manually remove what you don't want to send using our features for managing `Sensitive Data `_. + .. deprecated:: + Use `data_collection` instead. `send_default_pii` is still honored when `data_collection` is not set. + + :param data_collection: Structured configuration controlling what data integrations collect automatically, + superseding `send_default_pii`. Pass a dict or a :class:`sentry_sdk.DataCollection` instance to enable or + restrict collection per category (user identity, cookies, HTTP headers/bodies, query params, generative AI + inputs/outputs, stack frame variables, source context). + + When `data_collection` is set, omitted fields use their defaults (most categories are collected, with the + sensitive denylist scrubbing values). When it is not set, the SDK derives behavior from `send_default_pii` + so that upgrading without configuring `data_collection` changes nothing. If both are set, `data_collection` + takes precedence. 
+ + Example:: + + sentry_sdk.init( + dsn="...", + data_collection={"user_info": False, "http_bodies": []}, + ) + :param event_scrubber: Scrubs the event payload for sensitive information such as cookies, sessions, and passwords from a `denylist`. diff --git a/sentry_sdk/data_collection.py b/sentry_sdk/data_collection.py new file mode 100644 index 0000000000..0a18320729 --- /dev/null +++ b/sentry_sdk/data_collection.py @@ -0,0 +1,606 @@ +""" +Data Collection configuration. + +Implements the ``data_collection`` client option described in the Sentry SDK +"Data Collection" spec +(https://develop.sentry.dev/sdk/foundations/client/data-collection/). + +``data_collection`` supersedes the single ``send_default_pii`` boolean with a +structured configuration that lets users enable or restrict automatically +collected data by category (user identity, cookies, HTTP headers, query params, +HTTP bodies, generative AI inputs/outputs, stack frame variables, source +context). + +Resolution precedence (see :func:`resolve_data_collection`): + +* ``data_collection`` set, ``send_default_pii`` unset -> honor ``data_collection`` + using the spec defaults for any omitted field. +* ``send_default_pii`` set, ``data_collection`` unset -> derive a + ``DataCollection`` that mirrors what ``send_default_pii`` collects today. +* neither set -> treated as ``send_default_pii=False``. +* both set -> ``data_collection`` wins (it is the single source of truth); a + ``DeprecationWarning`` is emitted for ``send_default_pii``. + +The new collection-time filtering mechanisms (the partial-match sensitive +denylist and allow/deny key-value modes) only become active when +``data_collection`` is provided explicitly. Otherwise the SDK keeps its existing +behavior so that upgrading without configuring ``data_collection`` changes +nothing. 
+""" + +import warnings +from typing import TYPE_CHECKING +from urllib.parse import parse_qsl, urlencode + +from sentry_sdk._types import SENSITIVE_DATA_SUBSTITUTE + +if TYPE_CHECKING: + from typing import Any, Dict, List, Mapping, Optional + + +__all__ = [ + "DataCollection", + "KeyValueCollectionBehavior", + "GenAICollection", + "HttpHeadersCollection", + "SENSITIVE_DENYLIST", + "EXTENDED_GDPR_DENYLIST", +] + + +#: Body type identifiers accepted by ``DataCollection.http_bodies``. These match +#: the spec's camelCase string values so configuration is portable across SDKs. +BODY_TYPE_INCOMING_REQUEST = "incomingRequest" +BODY_TYPE_OUTGOING_REQUEST = "outgoingRequest" +BODY_TYPE_INCOMING_RESPONSE = "incomingResponse" +BODY_TYPE_OUTGOING_RESPONSE = "outgoingResponse" + +#: All valid body types. ``http_bodies`` defaults to this (collect everything the +#: platform supports); an empty list is the explicit opt-out. +ALL_BODY_TYPES = [ + BODY_TYPE_INCOMING_REQUEST, + BODY_TYPE_OUTGOING_REQUEST, + BODY_TYPE_INCOMING_RESPONSE, + BODY_TYPE_OUTGOING_RESPONSE, +] + +#: Default number of source lines captured above and below a stack frame. +DEFAULT_FRAME_CONTEXT_LINES = 5 + +#: Collection modes for key-value data (cookies, headers, query params). +COLLECTION_OFF = "off" +COLLECTION_DENYLIST = "denyList" +COLLECTION_ALLOWLIST = "allowList" +_VALID_MODES = (COLLECTION_OFF, COLLECTION_DENYLIST, COLLECTION_ALLOWLIST) + +#: Canonical sensitive denylist from the spec. Values of keys that contain any of +#: these terms (partial, case-insensitive) are always replaced with +#: ``"[Filtered]"`` regardless of the configured collection mode. +SENSITIVE_DENYLIST = [ + "auth", + "token", + "secret", + "password", + "passwd", + "pwd", + "key", + "jwt", + "bearer", + "sso", + "saml", + "csrf", + "xsrf", + "credentials", + "session", + "sid", + "identity", +] + +#: Additional GDPR-sensitive terms users may opt into via custom deny terms. 
+#: Not applied automatically; documented here for convenience.
+EXTENDED_GDPR_DENYLIST = ["forwarded", "-ip", "remote-", "via", "-user"]
+
+
+class KeyValueCollectionBehavior:
+    """
+    Controls which *values* of key-value data (cookies, headers, query params)
+    are sent in plaintext versus replaced with ``"[Filtered]"``. Key names are
+    always retained.
+
+    :param mode: one of ``"off"``, ``"denyList"`` (default), ``"allowList"``.
+    :param terms: in ``denyList`` mode, extra deny terms applied on top of the
+        built-in sensitive denylist; in ``allowList`` mode, the allow terms.
+        Matched as a partial, case-insensitive substring of the key name. A
+        single bare string is treated as a one-element list.
+    :raises ValueError: if ``mode`` is not one of the valid modes.
+    """
+
+    __slots__ = ("mode", "terms")
+
+    def __init__(self, mode: str = "denyList", terms: "Optional[List[str]]" = None):
+        if mode not in _VALID_MODES:
+            raise ValueError(
+                "Invalid KeyValueCollectionBehavior mode {!r}. Must be one of {}.".format(
+                    mode, _VALID_MODES
+                )
+            )
+        if isinstance(terms, str):
+            # ``list("page")`` would explode the string into single characters;
+            # as substring terms those match almost every key, which in
+            # allowList mode would send values the user never meant to allow.
+            terms = [terms] if terms else []
+        self.mode = mode
+        self.terms: "List[str]" = list(terms) if terms else []
+
+    def __repr__(self) -> str:
+        return "KeyValueCollectionBehavior(mode={!r}, terms={!r})".format(
+            self.mode, self.terms
+        )
+
+
+class GenAICollection:
+    """
+    Controls capture of generative AI input and output *content*. Metadata such
+    as model name and token counts is always collected regardless of these
+    settings.
+    """
+
+    __slots__ = ("inputs", "outputs")
+
+    def __init__(self, inputs: bool = True, outputs: bool = True):
+        self.inputs = inputs
+        self.outputs = outputs
+
+    def __repr__(self) -> str:
+        return "GenAICollection(inputs={!r}, outputs={!r})".format(
+            self.inputs, self.outputs
+        )
+
+
+class HttpHeadersCollection:
+    """
+    Configures request and response header collection independently. Each
+    direction is a :class:`KeyValueCollectionBehavior`.
+ """ + + __slots__ = ("request", "response") + + def __init__( + self, + request: "Optional[KeyValueCollectionBehavior]" = None, + response: "Optional[KeyValueCollectionBehavior]" = None, + ): + self.request: "KeyValueCollectionBehavior" = ( + request if request is not None else KeyValueCollectionBehavior() + ) + self.response: "KeyValueCollectionBehavior" = ( + response if response is not None else KeyValueCollectionBehavior() + ) + + def __repr__(self) -> str: + return "HttpHeadersCollection(request={!r}, response={!r})".format( + self.request, self.response + ) + + +class DataCollection: + """ + The ``data_collection`` client option. + + Pass an instance to ``sentry_sdk.init(data_collection=...)``. Any field left + as ``None`` is filled in with its spec default during resolution (see + :func:`resolve_data_collection`). After resolution the instance stored on the + client has concrete values for every field. + + :param user_info: automatically populate ``user.*`` fields (id, email, + username, ip_address) from instrumentation. Default ``True``. + :param cookies: cookie collection behavior. Default ``denyList``. + :param http_headers: request/response header collection. Default + ``denyList`` for both directions. + :param http_bodies: list of body types to collect. ``None`` -> all valid + types; ``[]`` -> off. + :param query_params: URL query parameter collection. Default ``denyList``. + :param gen_ai: generative AI input/output content collection. Default both + ``True``. + :param stack_frame_variables: include local variable values in stack frames. + Default ``True`` (falls back to ``include_local_variables``). + :param frame_context_lines: number of source lines above/below each frame. + Default ``5`` (falls back to ``include_source_context``). 
+ """ + + __slots__ = ( + "user_info", + "cookies", + "http_headers", + "http_bodies", + "query_params", + "gen_ai", + "stack_frame_variables", + "frame_context_lines", + "explicit", + ) + + def __init__( + self, + user_info: bool = True, + cookies: "Optional[KeyValueCollectionBehavior]" = None, + http_headers: "Optional[HttpHeadersCollection]" = None, + http_bodies: "Optional[List[str]]" = None, + query_params: "Optional[KeyValueCollectionBehavior]" = None, + gen_ai: "Optional[GenAICollection]" = None, + stack_frame_variables: "Optional[bool]" = None, + frame_context_lines: "Optional[int]" = None, + ): + # Fields with no legacy fallback default to their spec value, so they are + # always concrete (never None) on a constructed instance. + self.user_info = user_info + self.cookies = cookies if cookies is not None else KeyValueCollectionBehavior() + self.http_headers = ( + http_headers if http_headers is not None else HttpHeadersCollection() + ) + # http_bodies is None == "all valid types"; [] == off. + self.http_bodies = http_bodies + self.query_params = ( + query_params if query_params is not None else KeyValueCollectionBehavior() + ) + self.gen_ai = gen_ai if gen_ai is not None else GenAICollection() + # Frame fields keep None as "inherit from include_local_variables / + # include_source_context" so resolution can apply the legacy fallback. + self.stack_frame_variables = stack_frame_variables + self.frame_context_lines = frame_context_lines + # Whether the user supplied ``data_collection`` explicitly. Set during + # resolution. Collection-time filtering only changes from legacy behavior + # when this is True. 
+ self.explicit: bool = False + + def __repr__(self) -> str: + return ( + "DataCollection(user_info={!r}, cookies={!r}, http_headers={!r}, " + "http_bodies={!r}, query_params={!r}, gen_ai={!r}, " + "stack_frame_variables={!r}, frame_context_lines={!r}, explicit={!r})" + ).format( + self.user_info, + self.cookies, + self.http_headers, + self.http_bodies, + self.query_params, + self.gen_ai, + self.stack_frame_variables, + self.frame_context_lines, + self.explicit, + ) + + +def is_sensitive_key(key: str, extra_terms: "Optional[List[str]]" = None) -> bool: + """ + Return whether ``key`` matches the sensitive denylist using a partial, + case-insensitive substring match. + + :param extra_terms: additional deny terms (e.g. user-provided) to consider + alongside the built-in :data:`SENSITIVE_DENYLIST`. + """ + lowered = key.lower() + for term in SENSITIVE_DENYLIST: + if term in lowered: + return True + if extra_terms: + for term in extra_terms: + if term and term.lower() in lowered: + return True + return False + + +def apply_key_value_collection( + items: "Mapping[str, Any]", + behavior: "KeyValueCollectionBehavior", + substitute: "Any" = SENSITIVE_DATA_SUBSTITUTE, +) -> "Dict[str, Any]": + """ + Apply a :class:`KeyValueCollectionBehavior` to a mapping of key-value pairs. + + Returns a new dict. Key names are always retained (except for ``off`` mode, + which collects nothing). Sensitive keys (built-in denylist) are always + scrubbed, even under ``allowList`` mode. + """ + if behavior.mode == COLLECTION_OFF: + return {} + + result: "Dict[str, Any]" = {} + + if behavior.mode == COLLECTION_ALLOWLIST: + # behavior.terms is the ALLOW list here (not deny terms). A key sends its + # real value only if it matches an allow term AND is not sensitive (the + # built-in sensitive denylist always wins, even for allow-listed keys). 
+ for key, value in items.items(): + allowed = False + if isinstance(key, str): + lowered = key.lower() + allowed = any( + term and term.lower() in lowered for term in behavior.terms + ) + if allowed and not is_sensitive_key(key): + result[key] = value + else: + result[key] = substitute + return result + + # denyList (default): collect everything, scrub sensitive values. + for key, value in items.items(): + if isinstance(key, str) and is_sensitive_key(key, behavior.terms): + result[key] = substitute + else: + result[key] = value + return result + + +#: Header names whose raw value must never be sent. Cookies are collected +#: separately as parsed key-value pairs (see the cookies option); the raw +#: Cookie/Set-Cookie header value is always filtered (spec: unfiltered raw cookie +#: header values MUST NOT be sent). +_ALWAYS_FILTERED_HEADERS = ("cookie", "set-cookie") + + +def filter_request_headers( + headers: "Mapping[str, Any]", + behavior: "KeyValueCollectionBehavior", + substitute: "Any" = SENSITIVE_DATA_SUBSTITUTE, +) -> "Dict[str, Any]": + """ + Apply a header :class:`KeyValueCollectionBehavior`, additionally always + filtering the raw Cookie/Set-Cookie header values. + """ + filtered = apply_key_value_collection(headers, behavior, substitute=substitute) + for key in filtered: + if isinstance(key, str) and key.lower() in _ALWAYS_FILTERED_HEADERS: + filtered[key] = substitute + return filtered + + +def scrub_query_string( + query_string: str, + behavior: "KeyValueCollectionBehavior", +) -> "Optional[str]": + """ + Apply a query-param :class:`KeyValueCollectionBehavior` to a raw query + string. + + Returns ``None`` when the mode is ``off`` (do not collect the query string at + all), the scrubbed query string otherwise. An unparseable query string is + replaced entirely with ``"[Filtered]"``. 
+ """ + if behavior.mode == COLLECTION_OFF: + return None + + try: + pairs = parse_qsl(query_string, keep_blank_values=True) + except Exception: + return SENSITIVE_DATA_SUBSTITUTE + + if not pairs: + return query_string + + scrubbed = [] + for key, value in pairs: + if behavior.mode == COLLECTION_ALLOWLIST: + allowed = any( + term and term.lower() in key.lower() for term in behavior.terms + ) + scrubbed.append( + ( + key, + value + if (allowed and not is_sensitive_key(key)) + else SENSITIVE_DATA_SUBSTITUTE, + ) + ) + else: # denyList + scrubbed.append( + ( + key, + SENSITIVE_DATA_SUBSTITUTE + if is_sensitive_key(key, behavior.terms) + else value, + ) + ) + return urlencode(scrubbed) + + +def should_collect_body_type( + data_collection: "DataCollection", + body_type: str, +) -> bool: + """Return whether the given body type should be collected.""" + bodies = data_collection.http_bodies + if bodies is None: + return True + return body_type in bodies + + +def _map_from_send_default_pii( + send_default_pii: bool, + include_local_variables: bool, + include_source_context: bool, +) -> "DataCollection": + """ + Build a fully-resolved :class:`DataCollection` that mirrors the data + ``send_default_pii`` collects today. Used when ``data_collection`` is not + provided explicitly (resolution cases B and C). + """ + resolved = DataCollection( + user_info=send_default_pii, + cookies=KeyValueCollectionBehavior( + COLLECTION_DENYLIST if send_default_pii else COLLECTION_OFF + ), + # Headers are collected in both PII modes today (sensitive ones filtered + # when PII is off), so this never maps to "off". + http_headers=HttpHeadersCollection(), + # Bodies are collected regardless of PII today, bounded by + # ``max_request_body_size``. 
+ http_bodies=list(ALL_BODY_TYPES), + query_params=KeyValueCollectionBehavior( + COLLECTION_DENYLIST if send_default_pii else COLLECTION_OFF + ), + gen_ai=GenAICollection(inputs=send_default_pii, outputs=send_default_pii), + stack_frame_variables=include_local_variables, + frame_context_lines=( + DEFAULT_FRAME_CONTEXT_LINES if include_source_context else 0 + ), + ) + resolved.explicit = False + return resolved + + +def _resolve_explicit( + user_dc: "DataCollection", + include_local_variables: bool, + include_source_context: bool, +) -> "DataCollection": + """ + Fill in any omitted fields of a user-supplied ``DataCollection`` with their + spec defaults (resolution case A). Frame fields fall back to the legacy + ``include_local_variables`` / ``include_source_context`` options when unset. + """ + # frame_context_lines accepts an integer or a boolean fallback (spec: True + # -> platform default of 5, False -> 0). bool is a subclass of int, so + # coerce explicitly before treating it as a line count. + frame_context_lines = user_dc.frame_context_lines + if frame_context_lines is None: + frame_context_lines = ( + DEFAULT_FRAME_CONTEXT_LINES if include_source_context else 0 + ) + elif isinstance(frame_context_lines, bool): + frame_context_lines = DEFAULT_FRAME_CONTEXT_LINES if frame_context_lines else 0 + + resolved = DataCollection( + # These fields are always concrete on a constructed DataCollection. + user_info=user_dc.user_info, + cookies=user_dc.cookies, + http_headers=user_dc.http_headers, + query_params=user_dc.query_params, + gen_ai=user_dc.gen_ai, + # http_bodies: None means "all valid types"; materialize for clarity. + http_bodies=( + list(user_dc.http_bodies) + if user_dc.http_bodies is not None + else list(ALL_BODY_TYPES) + ), + # Frame fields fall back to the legacy options when unset. 
+        stack_frame_variables=(
+            user_dc.stack_frame_variables
+            if user_dc.stack_frame_variables is not None
+            else include_local_variables
+        ),
+        frame_context_lines=frame_context_lines,
+    )
+    resolved.explicit = True
+    return resolved
+
+
+def _data_collection_from_dict(d: "Dict[str, Any]") -> "DataCollection":
+    """
+    Convert a plain dict into a :class:`DataCollection`.
+
+    Values for ``cookies``, ``query_params``, ``http_headers`` and ``gen_ai``
+    are coerced to their config classes; the other known keys are passed
+    through unchanged. Keys that are not ``DataCollection`` fields are still
+    ignored, but a warning is emitted for them so that a misspelled opt-out
+    (e.g. ``"user_infos": False``) does not silently leave collection enabled.
+    """
+    # Keys whose values accept a str/dict shorthand and need coercion.
+    coercers = {
+        "cookies": _kvcb_from_value,
+        "http_headers": _http_headers_from_value,
+        "query_params": _kvcb_from_value,
+        "gen_ai": _gen_ai_from_value,
+    }
+    # Keys forwarded to DataCollection as-is.
+    passthrough = (
+        "user_info",
+        "http_bodies",
+        "stack_frame_variables",
+        "frame_context_lines",
+    )
+
+    unknown = [key for key in d if key not in coercers and key not in passthrough]
+    if unknown:
+        warnings.warn(
+            "Ignoring unknown `data_collection` key(s): {}.".format(
+                ", ".join(repr(key) for key in unknown)
+            ),
+            stacklevel=2,
+        )
+
+    kwargs: "Dict[str, Any]" = {}
+    for key in passthrough:
+        if key in d:
+            kwargs[key] = d[key]
+    for key, coerce in coercers.items():
+        if key in d:
+            kwargs[key] = coerce(d[key])
+
+    return DataCollection(**kwargs)
+
+
+def _kvcb_from_value(val: "Any") -> "KeyValueCollectionBehavior":
+    """Coerce a string or dict to :class:`KeyValueCollectionBehavior`."""
+    if isinstance(val, KeyValueCollectionBehavior):
+        return val
+    if isinstance(val, str):
+        return KeyValueCollectionBehavior(mode=val)
+    if isinstance(val, dict):
+        return KeyValueCollectionBehavior(**val)
+    raise TypeError(
+        "Expected a KeyValueCollectionBehavior, string, or dict, got {!r}".format(
+            type(val).__name__
+        )
+    )
+
+
+def _http_headers_from_value(val: "Any") -> "HttpHeadersCollection":
+    """Coerce a dict to :class:`HttpHeadersCollection`."""
+    if isinstance(val, HttpHeadersCollection):
+        return val
+    if isinstance(val, dict):
+        kwargs: "Dict[str, Any]" = {}
+        if "request" in val:
+            kwargs["request"] = _kvcb_from_value(val["request"])
+        if "response" in val:
+            kwargs["response"] = _kvcb_from_value(val["response"])
+        return
HttpHeadersCollection(**kwargs) + raise TypeError( + "Expected an HttpHeadersCollection or dict, got {!r}".format(type(val).__name__) + ) + + +def _gen_ai_from_value(val: "Any") -> "GenAICollection": + """Coerce a dict to :class:`GenAICollection`.""" + if isinstance(val, GenAICollection): + return val + if isinstance(val, dict): + return GenAICollection(**val) + raise TypeError( + "Expected a GenAICollection or dict, got {!r}".format(type(val).__name__) + ) + + +def resolve_data_collection(options: "Dict[str, Any]") -> "DataCollection": + """ + Resolve the effective :class:`DataCollection` from client ``options``. + + Reads ``data_collection``, ``send_default_pii``, ``include_local_variables`` + and ``include_source_context`` and returns a fully-resolved instance with + concrete values for every field. + + ``data_collection`` may be a :class:`DataCollection` instance or a plain + ``dict`` (which is converted automatically). + """ + user_dc = options.get("data_collection") + send_default_pii = options.get("send_default_pii") + include_local_variables = options.get("include_local_variables") + if include_local_variables is None: + include_local_variables = True + include_source_context = options.get("include_source_context") + if include_source_context is None: + include_source_context = True + + if user_dc is not None: + if isinstance(user_dc, dict): + user_dc = _data_collection_from_dict(user_dc) + elif not isinstance(user_dc, DataCollection): + raise TypeError( + "`data_collection` must be a dict or sentry_sdk.DataCollection " + "instance, got {!r}.".format(type(user_dc).__name__) + ) + if send_default_pii is not None: + warnings.warn( + "`send_default_pii` is deprecated and ignored when " + "`data_collection` is set. 
`data_collection` is the single " + "source of truth for automatic data collection.", + DeprecationWarning, + stacklevel=2, + ) + return _resolve_explicit( + user_dc, include_local_variables, include_source_context + ) + + return _map_from_send_default_pii( + bool(send_default_pii), include_local_variables, include_source_context + ) + + +#: Safe default used by non-recording clients: collect nothing PII-gated. +#: This is a shared, process-wide singleton. Treat it as read-only — do not +#: mutate the returned ``DataCollection`` or its nested config objects. +OFF_DATA_COLLECTION = _map_from_send_default_pii(False, True, True) diff --git a/sentry_sdk/scope.py b/sentry_sdk/scope.py index cec5e767c3..6392f5d2ce 100644 --- a/sentry_sdk/scope.py +++ b/sentry_sdk/scope.py @@ -98,6 +98,7 @@ SamplingContext, Type, ) + from sentry_sdk.data_collection import DataCollection from sentry_sdk.tracing import TransactionKwargs P = ParamSpec("P") @@ -2173,6 +2174,26 @@ def should_send_default_pii() -> bool: return Scope.get_client().should_send_default_pii() +def should_collect_user_info() -> bool: + """Shortcut for `Scope.get_client().should_collect_user_info()`.""" + return Scope.get_client().should_collect_user_info() + + +def should_collect_gen_ai_inputs(include_prompts: "Optional[bool]" = None) -> bool: + """Shortcut for `Scope.get_client().should_collect_gen_ai_inputs(...)`.""" + return Scope.get_client().should_collect_gen_ai_inputs(include_prompts) + + +def should_collect_gen_ai_outputs(include_prompts: "Optional[bool]" = None) -> bool: + """Shortcut for `Scope.get_client().should_collect_gen_ai_outputs(...)`.""" + return Scope.get_client().should_collect_gen_ai_outputs(include_prompts) + + +def get_data_collection() -> "DataCollection": + """Return the resolved DataCollection config of the active client.""" + return Scope.get_client().data_collection + + # Circular imports from sentry_sdk.client import NonRecordingClient diff --git a/tests/test_data_collection.py 
"""Tests for the ``data_collection`` option and its helper functions."""

import warnings

import pytest

import sentry_sdk
from sentry_sdk import (
    DataCollection,
    GenAICollection,
    HttpHeadersCollection,
    KeyValueCollectionBehavior,
)
from sentry_sdk.data_collection import (
    ALL_BODY_TYPES,
    SENSITIVE_DENYLIST,
    apply_key_value_collection,
    filter_request_headers,
    is_sensitive_key,
    resolve_data_collection,
    scrub_query_string,
    should_collect_body_type,
)

# ---------------------------------------------------------------------------
# Sensitive denylist (partial, case-insensitive)
# ---------------------------------------------------------------------------


def test_sensitive_denylist_matches_spec():
    """``SENSITIVE_DENYLIST`` equals this exact list of terms, in this order."""
    expected_terms = [
        "auth",
        "token",
        "secret",
        "password",
        "passwd",
        "pwd",
        "key",
        "jwt",
        "bearer",
        "sso",
        "saml",
        "csrf",
        "xsrf",
        "credentials",
        "session",
        "sid",
        "identity",
    ]
    assert SENSITIVE_DENYLIST == expected_terms


@pytest.mark.parametrize(
    ("key", "expected"),
    [
        ("Authorization", True),  # "auth" is a substring
        ("X-Auth-Token", True),
        ("authorization", True),
        ("PASSWORD", True),
        ("X-Api-Key", True),  # "key" is a substring
        ("sessionid", True),  # "session" is a substring
        ("Accept", False),
        ("Content-Type", False),
        ("X-Request-Id", False),
    ],
)
def test_is_sensitive_key(key, expected):
    """Keys are classified by substring match, regardless of letter case."""
    verdict = is_sensitive_key(key)
    assert verdict is expected


def test_is_sensitive_key_extra_terms():
    """Extra terms passed by the caller extend the built-in denylist."""
    header_name = "x-forwarded-for"
    assert is_sensitive_key(header_name, ["forwarded"]) is True
    assert is_sensitive_key(header_name) is False


# ---------------------------------------------------------------------------
# Key-value collection behavior
# ---------------------------------------------------------------------------


def test_kvcb_invalid_mode():
    """An unknown mode string is rejected with ``ValueError``."""
    with pytest.raises(ValueError):
        KeyValueCollectionBehavior(mode="nope")


def test_apply_off():
    """Mode ``off`` yields an empty mapping."""
    behavior = KeyValueCollectionBehavior("off")
    assert apply_key_value_collection({"a": "1"}, behavior) == {}


def test_apply_denylist_scrubs_sensitive_keeps_rest():
    """Mode ``denyList`` filters sensitive keys and passes the rest through."""
    pairs = {"Authorization": "secret", "Accept": "json", "X-Id": "1"}
    behavior = KeyValueCollectionBehavior("denyList")
    result = apply_key_value_collection(pairs, behavior)
    assert result == {"Authorization": "[Filtered]", "Accept": "json", "X-Id": "1"}


def test_apply_denylist_extra_terms():
    """Terms supplied to ``denyList`` are filtered as well."""
    pairs = {"X-Custom": "v", "Accept": "json"}
    behavior = KeyValueCollectionBehavior("denyList", ["x-custom"])
    result = apply_key_value_collection(pairs, behavior)
    assert result == {"X-Custom": "[Filtered]", "Accept": "json"}


def test_apply_allowlist_only_allowed_real():
    """Mode ``allowList`` keeps only listed keys; every other value is filtered."""
    pairs = {"X-Request-Id": "r1", "Accept": "json", "Authorization": "x"}
    behavior = KeyValueCollectionBehavior("allowList", ["x-request-id"])
    result = apply_key_value_collection(pairs, behavior)
    expected = {
        "X-Request-Id": "r1",
        "Accept": "[Filtered]",
        "Authorization": "[Filtered]",
    }
    assert result == expected


def test_apply_allowlist_sensitive_always_scrubbed():
    """Allow-listing a sensitive key does not exempt it from filtering."""
    pairs = {"Authorization": "x"}
    behavior = KeyValueCollectionBehavior("allowList", ["authorization"])
    result = apply_key_value_collection(pairs, behavior)
    assert result == {"Authorization": "[Filtered]"}


def test_filter_request_headers_always_filters_cookie():
    """``Cookie`` and ``Set-Cookie`` are filtered under plain ``denyList``."""
    headers = {"Cookie": "a=b", "Set-Cookie": "c=d", "Accept": "json"}
    behavior = KeyValueCollectionBehavior("denyList")
    result = filter_request_headers(headers, behavior)
    expected = {
        "Cookie": "[Filtered]",
        "Set-Cookie": "[Filtered]",
        "Accept": "json",
    }
    assert result == expected


# ---------------------------------------------------------------------------
# Query string scrubbing
# ---------------------------------------------------------------------------


def test_scrub_query_off():
    """Mode ``off`` makes the scrubber return ``None``."""
    behavior = KeyValueCollectionBehavior("off")
    assert scrub_query_string("a=1&token=x", behavior) is None


def test_scrub_query_denylist():
    """Sensitive params keep their key but lose their value."""
    behavior = KeyValueCollectionBehavior("denyList")
    scrubbed = scrub_query_string("token=abc&page=5", behavior)
    assert "page=5" in scrubbed
    assert "token=" in scrubbed
    assert "abc" not in scrubbed


def test_scrub_query_allowlist():
    """Allow-listed params survive; the unlisted param's value is removed."""
    behavior = KeyValueCollectionBehavior("allowList", ["page"])
    scrubbed = scrub_query_string("token=abc&page=5", behavior)
    assert "page=5" in scrubbed
    assert "abc" not in scrubbed


# ---------------------------------------------------------------------------
# Body type collection
# ---------------------------------------------------------------------------


def test_body_type_default_all():
    """With ``http_bodies`` left at its default, incoming requests are collected."""
    config = DataCollection()
    assert should_collect_body_type(config, "incomingRequest") is True


def test_body_type_explicit_list():
    """Only body types named in ``http_bodies`` are collected."""
    config = DataCollection(http_bodies=["incomingRequest"])
    assert should_collect_body_type(config, "incomingRequest") is True
    assert should_collect_body_type(config, "outgoingRequest") is False


def test_body_type_empty_off():
    """An empty ``http_bodies`` list disables body collection."""
    config = DataCollection(http_bodies=[])
    assert should_collect_body_type(config, "incomingRequest") is False


# ---------------------------------------------------------------------------
# Resolution: cases A / B / C / D
# ---------------------------------------------------------------------------
# As exercised below: A = only ``data_collection`` given, B = only
# ``send_default_pii`` given, C = neither given, D = both given.


def _resolve(**options):
    """Run ``resolve_data_collection`` on baseline options overlaid with *options*."""
    baseline = {
        "data_collection": None,
        "send_default_pii": None,
        "include_local_variables": True,
        "include_source_context": True,
    }
    return resolve_data_collection({**baseline, **options})


def test_resolve_case_c_neither():
    """Case C: with neither option set, the result is non-explicit and restrictive."""
    resolved = _resolve()
    assert resolved.explicit is False
    assert resolved.user_info is False
    assert resolved.gen_ai.inputs is False
    assert resolved.gen_ai.outputs is False
    assert resolved.cookies.mode == "off"
    assert resolved.query_params.mode == "off"
    assert resolved.http_headers.request.mode == "denyList"
    assert resolved.http_bodies == ALL_BODY_TYPES
    assert resolved.frame_context_lines == 5


def test_resolve_case_b_pii_true():
    """Case B: ``send_default_pii=True`` turns on user info, gen AI and denyList modes."""
    resolved = _resolve(send_default_pii=True)
    assert resolved.explicit is False
    assert resolved.user_info is True
    assert resolved.gen_ai.inputs is True
    assert resolved.gen_ai.outputs is True
    assert resolved.cookies.mode == "denyList"
    assert resolved.query_params.mode == "denyList"


def test_resolve_case_b_pii_false():
    """Case B: ``send_default_pii=False`` keeps user info and cookies off."""
    resolved = _resolve(send_default_pii=False)
    assert resolved.explicit is False
    assert resolved.user_info is False
    assert resolved.cookies.mode == "off"


def test_resolve_case_a_defaults():
    """Case A: a bare ``DataCollection()`` is explicit and collects by default."""
    resolved = _resolve(data_collection=DataCollection())
    assert resolved.explicit is True
    # The explicit defaults collect more than the legacy ones.
    assert resolved.user_info is True
    assert resolved.gen_ai.inputs is True
    assert resolved.gen_ai.outputs is True
    assert resolved.cookies.mode == "denyList"
    assert resolved.query_params.mode == "denyList"
    assert resolved.http_bodies == ALL_BODY_TYPES


def test_resolve_case_a_partial_uses_spec_defaults_for_omitted():
    """Case A: fields left out of ``DataCollection`` fall back to its defaults."""
    partial = DataCollection(user_info=False, http_bodies=[])
    resolved = _resolve(data_collection=partial)
    assert resolved.explicit is True
    assert resolved.user_info is False
    assert resolved.http_bodies == []
    # Fields that were not passed take the default values.
    assert resolved.gen_ai.inputs is True
    assert resolved.cookies.mode == "denyList"


def test_resolve_case_a_frame_fallback_to_legacy_options():
    """Case A: unset frame fields follow the legacy ``include_*`` options."""
    resolved = _resolve(
        data_collection=DataCollection(),
        include_local_variables=False,
        include_source_context=False,
    )
    assert resolved.stack_frame_variables is False
    assert resolved.frame_context_lines == 0


def test_resolve_case_d_both_data_collection_wins_and_warns():
    """Case D: ``data_collection`` takes precedence and a deprecation is emitted."""
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        resolved = _resolve(
            send_default_pii=True,
            data_collection=DataCollection(user_info=False),
        )
    assert resolved.explicit is True
    assert resolved.user_info is False  # data_collection wins
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)


def test_resolve_accepts_dict():
    """A plain dict is accepted in place of a ``DataCollection`` instance."""
    as_dict = {"user_info": False, "http_bodies": []}
    resolved = _resolve(data_collection=as_dict)
    assert resolved.explicit is True
    assert resolved.user_info is False
    assert resolved.http_bodies == []
    assert resolved.gen_ai.inputs is True


def test_resolve_accepts_dict_with_nested_dicts():
    """Nested dicts and bare mode strings are converted into sub-configurations."""
    nested_config = {
        "cookies": "off",
        "query_params": {"mode": "allowList", "terms": ["page"]},
        "http_headers": {"request": "off"},
        "gen_ai": {"inputs": False, "outputs": True},
    }
    resolved = _resolve(data_collection=nested_config)
    assert resolved.cookies.mode == "off"
    assert resolved.query_params.mode == "allowList"
    assert resolved.query_params.terms == ["page"]
    assert resolved.http_headers.request.mode == "off"
    assert resolved.http_headers.response.mode == "denyList"
    assert resolved.gen_ai.inputs is False
    assert resolved.gen_ai.outputs is True


def test_resolve_rejects_non_datacollection():
    """A ``data_collection`` value of an unsupported type raises ``TypeError``."""
    with pytest.raises(TypeError):
        _resolve(data_collection=42)


# ---------------------------------------------------------------------------
# frame_context_lines boolean fallback
# ---------------------------------------------------------------------------


@pytest.mark.parametrize(
    ("value", "expected"),
    [(True, 5), (False, 0), (3, 3), (0, 0)],
)
def test_frame_context_lines_bool_fallback(value, expected):
    """``True`` resolves to 5, ``False`` to 0, and integers are kept as given."""
    config = DataCollection(frame_context_lines=value)
    resolved = _resolve(data_collection=config)
    assert resolved.frame_context_lines == expected
# ---------------------------------------------------------------------------
# Client accessors
# ---------------------------------------------------------------------------


def test_client_accessors_case_c():
    """With no options passed to ``init``, every accessor reports ``False``."""
    sentry_sdk.init()
    active_client = sentry_sdk.get_client()
    assert active_client.should_collect_user_info() is False
    assert active_client.should_send_default_pii() is False
    assert active_client.should_collect_gen_ai_inputs() is False
    assert active_client.should_collect_gen_ai_outputs() is False


def test_client_accessors_case_b_pii():
    """``send_default_pii=True`` enables user info and gen AI input collection."""
    sentry_sdk.init(send_default_pii=True)
    active_client = sentry_sdk.get_client()
    assert active_client.should_collect_user_info() is True
    assert active_client.should_collect_gen_ai_inputs() is True
    # Passing False (the include_prompts-style override) still disables
    # collection -- the legacy AND semantics.
    assert active_client.should_collect_gen_ai_inputs(False) is False
    assert active_client.should_collect_gen_ai_inputs(True) is True


def test_client_accessors_case_a():
    """An explicit ``DataCollection`` drives the accessors; an override can disable."""
    sentry_sdk.init(data_collection=DataCollection(user_info=False))
    active_client = sentry_sdk.get_client()
    assert active_client.should_collect_user_info() is False
    # In explicit mode gen_ai defaults to True.
    assert active_client.should_collect_gen_ai_inputs() is True
    # An explicit integration-level override takes precedence.
    assert active_client.should_collect_gen_ai_inputs(False) is False


def test_client_accessors_gen_ai_explicit_override():
    """Inputs and outputs are independent, and an override of ``True`` wins."""
    gen_ai_config = GenAICollection(inputs=False, outputs=True)
    sentry_sdk.init(data_collection=DataCollection(gen_ai=gen_ai_config))
    active_client = sentry_sdk.get_client()
    assert active_client.should_collect_gen_ai_inputs() is False
    assert active_client.should_collect_gen_ai_outputs() is True
    # The integration-level override beats the global gen_ai setting.
    assert active_client.should_collect_gen_ai_inputs(True) is True


def test_http_headers_collection_defaults():
    """Request and response headers both default to ``denyList`` mode."""
    header_config = HttpHeadersCollection()
    assert header_config.request.mode == "denyList"
    assert header_config.response.mode == "denyList"