Skip to content

Commit

Permalink
Add attribute noting details of scrubbed values (#278)
Browse files Browse the repository at this point in the history
  • Loading branch information
alexmojaki committed Jul 5, 2024
1 parent 02f240f commit b04c3a3
Show file tree
Hide file tree
Showing 10 changed files with 239 additions and 35 deletions.
3 changes: 3 additions & 0 deletions logfire-api/logfire_api/_internal/constants.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ LOGGING_TO_OTEL_LEVEL_NUMBERS: Incomplete
ATTRIBUTES_LOG_LEVEL_NAME_KEY: Incomplete
ATTRIBUTES_LOG_LEVEL_NUM_KEY: Incomplete


def log_level_attributes(level: LevelName | int) -> dict[str, otel_types.AttributeValue]: ...


SpanTypeType: Incomplete
ATTRIBUTES_SPAN_TYPE_KEY: Incomplete
ATTRIBUTES_PENDING_SPAN_REAL_PARENT_KEY: Incomplete
Expand All @@ -21,6 +23,7 @@ DISABLE_CONSOLE_KEY: Incomplete
ATTRIBUTES_JSON_SCHEMA_KEY: Incomplete
ATTRIBUTES_LOGGING_ARGS_KEY: Incomplete
ATTRIBUTES_VALIDATION_ERROR_KEY: str
ATTRIBUTES_SCRUBBED_KEY: str
NULL_ARGS_KEY: str
PENDING_SPAN_NAME_SUFFIX: str
LOGFIRE_BASE_URL: str
Expand Down
4 changes: 2 additions & 2 deletions logfire-api/logfire_api/_internal/formatter.pyi
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import ast
import executing
import types
from .constants import MESSAGE_FORMATTED_VALUE_LENGTH_LIMIT as MESSAGE_FORMATTED_VALUE_LENGTH_LIMIT
from .scrubbing import BaseScrubber as BaseScrubber
from .constants import ATTRIBUTES_SCRUBBED_KEY as ATTRIBUTES_SCRUBBED_KEY, MESSAGE_FORMATTED_VALUE_LENGTH_LIMIT as MESSAGE_FORMATTED_VALUE_LENGTH_LIMIT
from .scrubbing import BaseScrubber as BaseScrubber, ScrubbedNote as ScrubbedNote
from .utils import truncate_string as truncate_string
from _typeshed import Incomplete
from logfire._internal.stack_info import get_user_frame_and_stacklevel as get_user_frame_and_stacklevel
Expand Down
57 changes: 51 additions & 6 deletions logfire-api/logfire_api/_internal/scrubbing.pyi
Original file line number Diff line number Diff line change
@@ -1,46 +1,91 @@
import re
from .constants import ATTRIBUTES_JSON_SCHEMA_KEY as ATTRIBUTES_JSON_SCHEMA_KEY, ATTRIBUTES_LOG_LEVEL_NAME_KEY as ATTRIBUTES_LOG_LEVEL_NAME_KEY, ATTRIBUTES_LOG_LEVEL_NUM_KEY as ATTRIBUTES_LOG_LEVEL_NUM_KEY, ATTRIBUTES_MESSAGE_KEY as ATTRIBUTES_MESSAGE_KEY, ATTRIBUTES_MESSAGE_TEMPLATE_KEY as ATTRIBUTES_MESSAGE_TEMPLATE_KEY, ATTRIBUTES_PENDING_SPAN_REAL_PARENT_KEY as ATTRIBUTES_PENDING_SPAN_REAL_PARENT_KEY, ATTRIBUTES_SAMPLE_RATE_KEY as ATTRIBUTES_SAMPLE_RATE_KEY, ATTRIBUTES_SPAN_TYPE_KEY as ATTRIBUTES_SPAN_TYPE_KEY, ATTRIBUTES_TAGS_KEY as ATTRIBUTES_TAGS_KEY, NULL_ARGS_KEY as NULL_ARGS_KEY, RESOURCE_ATTRIBUTES_PACKAGE_VERSIONS as RESOURCE_ATTRIBUTES_PACKAGE_VERSIONS
import typing_extensions
from .constants import (
ATTRIBUTES_JSON_SCHEMA_KEY as ATTRIBUTES_JSON_SCHEMA_KEY,
ATTRIBUTES_LOG_LEVEL_NAME_KEY as ATTRIBUTES_LOG_LEVEL_NAME_KEY,
ATTRIBUTES_LOG_LEVEL_NUM_KEY as ATTRIBUTES_LOG_LEVEL_NUM_KEY,
ATTRIBUTES_MESSAGE_KEY as ATTRIBUTES_MESSAGE_KEY,
ATTRIBUTES_MESSAGE_TEMPLATE_KEY as ATTRIBUTES_MESSAGE_TEMPLATE_KEY,
ATTRIBUTES_PENDING_SPAN_REAL_PARENT_KEY as ATTRIBUTES_PENDING_SPAN_REAL_PARENT_KEY,
ATTRIBUTES_SAMPLE_RATE_KEY as ATTRIBUTES_SAMPLE_RATE_KEY,
ATTRIBUTES_SCRUBBED_KEY as ATTRIBUTES_SCRUBBED_KEY,
ATTRIBUTES_SPAN_TYPE_KEY as ATTRIBUTES_SPAN_TYPE_KEY,
ATTRIBUTES_TAGS_KEY as ATTRIBUTES_TAGS_KEY,
NULL_ARGS_KEY as NULL_ARGS_KEY,
RESOURCE_ATTRIBUTES_PACKAGE_VERSIONS as RESOURCE_ATTRIBUTES_PACKAGE_VERSIONS,
)
from .stack_info import STACK_INFO_KEYS as STACK_INFO_KEYS
from .utils import ReadableSpanDict as ReadableSpanDict
from _typeshed import Incomplete
from abc import ABC, abstractmethod
from dataclasses import dataclass
from opentelemetry.sdk.trace import Event
from typing import Any, Callable, Sequence
from typing import Any, Callable, Sequence, TypedDict

DEFAULT_PATTERNS: Incomplete
JsonPath: typing_extensions.TypeAlias = 'tuple[str | int, ...]'


@dataclass
class ScrubMatch:
"""An object passed to the [`scrubbing_callback`][logfire.configure(scrubbing_callback)] function."""
path: tuple[str | int, ...]

path: JsonPath
value: Any
pattern_match: re.Match[str]


ScrubCallback = Callable[[ScrubMatch], Any]


class ScrubbedNote(TypedDict):
path: JsonPath
matched_substring: str


@dataclass
class ScrubbingOptions:
"""Options for redacting sensitive data."""

callback: ScrubCallback | None = ...
extra_patterns: Sequence[str] | None = ...


class BaseScrubber(ABC):
SAFE_KEYS: Incomplete

@abstractmethod
def scrub_span(self, span: ReadableSpanDict): ...
@abstractmethod
def scrub(self, path: tuple[str | int, ...], value: Any) -> Any: ...
def scrub_value(self, path: JsonPath, value: Any) -> tuple[Any, list[ScrubbedNote]]: ...


class NoopScrubber(BaseScrubber):
def scrub_span(self, span: ReadableSpanDict): ...
def scrub(self, path: tuple[str | int, ...], value: Any) -> Any: ...
def scrub_value(self, path: JsonPath, value: Any) -> tuple[Any, list[ScrubbedNote]]: ...


class Scrubber(BaseScrubber):
"""Redacts potentially sensitive data."""

def __init__(self, patterns: Sequence[str] | None, callback: ScrubCallback | None = None) -> None: ...
def scrub_span(self, span: ReadableSpanDict): ...
def scrub_value(self, path: JsonPath, value: Any) -> tuple[Any, list[ScrubbedNote]]: ...


class SpanScrubber:
"""Does the actual scrubbing work.
This class is separate from Scrubber so that it can be instantiated more regularly
and hold and mutate state about the span being scrubbed, specifically the scrubbed notes.
"""

scrubbed: list[ScrubbedNote]

def __init__(self, parent: Scrubber) -> None: ...
def scrub_span(self, span: ReadableSpanDict): ...
def scrub_event_attributes(self, event: Event, index: int): ...
def scrub(self, path: tuple[str | int, ...], value: Any) -> Any:
def scrub(self, path: JsonPath, value: Any) -> Any:
"""Redacts sensitive data from `value`, recursing into nested sequences and mappings.
`path` is a tuple of keys and indices leading to `value` in the span.
Expand Down
3 changes: 3 additions & 0 deletions logfire/_internal/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,9 @@ def log_level_attributes(level: LevelName | int) -> dict[str, otel_types.Attribu
ATTRIBUTES_VALIDATION_ERROR_KEY = 'exception.logfire.data'
"""The key within OTEL attributes where logfire puts validation errors."""

ATTRIBUTES_SCRUBBED_KEY = f'{LOGFIRE_ATTRIBUTES_NAMESPACE}.scrubbed'
"""Key in OTEL attributes with metadata about parts of a span that have been scrubbed."""

NULL_ARGS_KEY = 'logfire.null_args'
"""Key in OTEL attributes that collects attributes with a null (None) value."""

Expand Down
35 changes: 22 additions & 13 deletions logfire/_internal/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@
import logfire
from logfire._internal.stack_info import get_user_frame_and_stacklevel

from .constants import MESSAGE_FORMATTED_VALUE_LENGTH_LIMIT
from .scrubbing import BaseScrubber
from .constants import ATTRIBUTES_SCRUBBED_KEY, MESSAGE_FORMATTED_VALUE_LENGTH_LIMIT
from .scrubbing import BaseScrubber, ScrubbedNote
from .utils import truncate_string


Expand Down Expand Up @@ -46,20 +46,21 @@ def chunks(
# Returns
# 1. A list of chunks
# 2. A dictionary of extra attributes to add to the span/log.
# These can come from evaluating values in f-strings.
# These can come from evaluating values in f-strings,
# or from noting scrubbed values.
# 3. The final message template, which may differ from `format_string` if it was an f-string.
if fstring_frame:
result = self._fstring_chunks(kwargs, scrubber, fstring_frame)
if result: # returns None if failed
return result

chunks = self._vformat_chunks(
chunks, extra_attrs = self._vformat_chunks(
format_string,
kwargs=kwargs,
scrubber=scrubber,
)
# When there's no f-string magic, there's no extra attributes or changes in the template string.
return chunks, {}, format_string
# When there's no f-string magic, there are no changes in the template string.
return chunks, extra_attrs, format_string

def _fstring_chunks(
self,
Expand Down Expand Up @@ -197,6 +198,7 @@ def _fstring_chunks(
new_template = ''

extra_attrs: dict[str, Any] = {}
scrubbed: list[ScrubbedNote] = []
for node_value in arg_node.values:
if isinstance(node_value, ast.Constant):
# These are the parts of the f-string not enclosed by `{}`, e.g. 'foo ' in f'foo {bar}'
Expand Down Expand Up @@ -224,9 +226,12 @@ def _fstring_chunks(

# Format the value according to the format spec, converting to a string.
formatted = eval(formatted_code, global_vars, {**local_vars, '@fvalue': value})
formatted = self._clean_value(source, formatted, scrubber)
formatted, value_scrubbed = self._clean_value(source, formatted, scrubber)
scrubbed += value_scrubbed
result.append({'v': formatted, 't': 'arg'})

if scrubbed:
extra_attrs[ATTRIBUTES_SCRUBBED_KEY] = scrubbed
return result, extra_attrs, new_template

def _vformat_chunks(
Expand All @@ -237,7 +242,7 @@ def _vformat_chunks(
scrubber: BaseScrubber,
recursion_depth: int = 2,
auto_arg_index: int = 0,
) -> list[LiteralChunk | ArgChunk]:
) -> tuple[list[LiteralChunk | ArgChunk], dict[str, Any]]:
"""Copied from `string.Formatter._vformat` https://github.com/python/cpython/blob/v3.11.4/Lib/string.py#L198-L247 then altered."""
if recursion_depth < 0: # pragma: no cover
raise ValueError('Max string recursion exceeded')
Expand All @@ -246,6 +251,7 @@ def _vformat_chunks(
used_args: set[str | int] = set()
# We currently don't use positional arguments
args = ()
scrubbed: list[ScrubbedNote] = []
for literal_text, field_name, format_spec, conversion in self.parse(format_string):
# output the literal text
if literal_text:
Expand Down Expand Up @@ -313,21 +319,24 @@ def _vformat_chunks(
value = self.NONE_REPR
else:
value = self.format_field(obj, format_spec)
value = self._clean_value(field_name, value, scrubber)
value, value_scrubbed = self._clean_value(field_name, value, scrubber)
scrubbed += value_scrubbed
d: ArgChunk = {'v': value, 't': 'arg'}
if format_spec:
d['spec'] = format_spec
result.append(d)

return result
extra_attrs = {ATTRIBUTES_SCRUBBED_KEY: scrubbed} if scrubbed else {}
return result, extra_attrs

def _clean_value(self, field_name: str, value: str, scrubber: BaseScrubber) -> str:
def _clean_value(self, field_name: str, value: str, scrubber: BaseScrubber) -> tuple[str, list[ScrubbedNote]]:
# Scrub before truncating so that the scrubber can see the full value.
# For example, if the value contains 'password=123' and 'password' is replaced by '...'
# because of truncation, then that leaves '=123' in the message, which is not good.
scrubbed: list[ScrubbedNote] = []
if field_name not in scrubber.SAFE_KEYS:
value = scrubber.scrub(('message', field_name), value)
return truncate_string(value, max_length=MESSAGE_FORMATTED_VALUE_LENGTH_LIMIT)
value, scrubbed = scrubber.scrub_value(('message', field_name), value)
return truncate_string(value, max_length=MESSAGE_FORMATTED_VALUE_LENGTH_LIMIT), scrubbed


chunks_formatter = ChunksFormatter()
Expand Down
13 changes: 9 additions & 4 deletions logfire/_internal/json_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from types import GeneratorType
from typing import Any, Callable, Iterable, Mapping, NewType, Sequence, cast

from .constants import ATTRIBUTES_SCRUBBED_KEY
from .json_encoder import is_attrs, is_sqlalchemy, to_json_value
from .stack_info import STACK_INFO_KEYS
from .utils import JsonDict, dump_json, safe_repr
Expand Down Expand Up @@ -154,13 +155,17 @@ def attributes_json_schema(properties: JsonSchemaProperties) -> str:
# This becomes the value of `properties` above.
def attributes_json_schema_properties(attributes: dict[str, Any]) -> JsonSchemaProperties:
return JsonSchemaProperties(
# NOTE: The code related attributes are merged with the logfire function attributes on
# `install_auto_tracing` and when using our stdlib logging handler. We need to remove them
# from the JSON Schema, as we only want to have the ones that the user passes in.
{key: create_json_schema(value, set()) for key, value in attributes.items() if key not in STACK_INFO_KEYS}
{key: create_json_schema(value, set()) for key, value in attributes.items() if key not in EXCLUDE_KEYS}
)


# Attributes from STACK_INFO_KEYS are merged with the logfire function attributes on
# `install_auto_tracing` and when using our stdlib logging handler. We need to remove them
# from the JSON Schema, as we only want to have the ones that the user passes in.
# ATTRIBUTES_SCRUBBED_KEY can be set when formatting a message.
EXCLUDE_KEYS = STACK_INFO_KEYS | {ATTRIBUTES_SCRUBBED_KEY}


def _dataclass_schema(obj: Any, seen: set[int]) -> JsonDict:
# NOTE: The `x-python-datatype` is "dataclass" for both standard dataclasses and Pydantic dataclasses.
# We don't need to distinguish between them on the frontend, or to reconstruct the type on the JSON formatter.
Expand Down
Loading

0 comments on commit b04c3a3

Please sign in to comment.