ref(tech-debt) Column validator also checks mapped columns (#4116)
The EntityContainsColumnsValidator wasn't checking mapped columns when
validating whether a column is allowed on the entity. Change this so that any
mapped columns are also marked as valid.

Also enable this on one dataset (search_issues) as a test. If everything works
correctly, the rest of the datasets will also get this validator.
evanh authored May 15, 2023
1 parent 7a1e249 commit fe11e57
Showing 7 changed files with 134 additions and 22 deletions.
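For orientation, here is a minimal, self-contained sketch of the behaviour this commit is after; the class and names below are simplified stand-ins, not Snuba's actual API. The idea: a referenced column name is acceptable if it is either a real column of the entity or the source name (from_col_name) of a column-to-expression mapping.

from dataclasses import dataclass


@dataclass(frozen=True)
class ColumnToExpression:
    # the logical column name that queries are allowed to reference
    from_col_name: str


class ContainsColumnsValidator:
    def __init__(self, columns: set[str], mappers: list[ColumnToExpression]) -> None:
        self.columns = columns
        # names that a mapper translates away are also considered valid
        self.mapped_columns = {m.from_col_name for m in mappers}

    def validate(self, referenced: set[str]) -> None:
        missing = referenced - self.columns - self.mapped_columns
        if missing:
            raise ValueError(f"query references non-existent columns: {sorted(missing)}")


validator = ContainsColumnsValidator(
    columns={"event_id", "project_id"},
    mappers=[ColumnToExpression(from_col_name="email")],
)
validator.validate({"event_id", "email"})  # ok: "email" is mapped, so it is not missing
# validator.validate({"asdf"})             # would raise ValueError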
@@ -203,9 +203,12 @@ query_processors:
       project_field: project_id
   - processor: BasicFunctionsProcessor
 
+validate_data_model: warn
 validators:
   - validator: EntityRequiredColumnValidator
     args:
       required_filter_columns: ["project_id"]
+  - validator: TagConditionValidator
+    args: {}
 
 required_time_column: timestamp
3 changes: 2 additions & 1 deletion snuba/datasets/entity.py
@@ -55,8 +55,9 @@ def __init__(
         self.__subscription_validators = subscription_validators
         self.required_time_column = required_time_column
 
+        mappers = [s.translation_mappers for s in storages]
         columns_exist_validator = EntityContainsColumnsValidator(
-            self.__data_model, validation_mode=validate_data_model
+            self.__data_model, mappers, validation_mode=validate_data_model
         )
         self.__validators = (
             [*validators, columns_exist_validator]
3 changes: 2 additions & 1 deletion snuba/datasets/pluggable_entity.py
@@ -62,9 +62,10 @@ class PluggableEntity(Entity):
     subscription_validators: Optional[Sequence[EntitySubscriptionValidator]] = None
 
     def _get_builtin_validators(self) -> Sequence[QueryValidator]:
+        mappers = [s.translation_mappers for s in self.storages]
         return [
             EntityContainsColumnsValidator(
-                EntityColumnSet(self.columns), self.validate_data_model
+                EntityColumnSet(self.columns), mappers, self.validate_data_model
             )
         ]
 
15 changes: 14 additions & 1 deletion snuba/query/validation/validators.py
@@ -6,6 +6,8 @@
 from enum import Enum
 from typing import Optional, Sequence, Type, cast
 
+from snuba.clickhouse.translators.snuba.mappers import ColumnToExpression
+from snuba.clickhouse.translators.snuba.mapping import TranslationMappers
 from snuba.datasets.entities.entity_data_model import EntityColumnSet
 from snuba.environment import metrics as environment_metrics
 from snuba.query import Query
@@ -108,10 +110,20 @@ class EntityContainsColumnsValidator(QueryValidator):
     """
 
     def __init__(
-        self, entity_data_model: EntityColumnSet, validation_mode: ColumnValidationMode
+        self,
+        entity_data_model: EntityColumnSet,
+        mappers: list[TranslationMappers],
+        validation_mode: ColumnValidationMode,
     ) -> None:
         self.validation_mode = validation_mode
         self.entity_data_model = entity_data_model
+        # The entity can accept some column names that get mapped to other expressions
+        # Parse and store those mappings as well
+        self.mapped_columns = set()
+        for mapper in mappers:
+            for colmapping in mapper.columns:
+                if isinstance(colmapping, ColumnToExpression):
+                    self.mapped_columns.add(colmapping.from_col_name)
 
     def validate(self, query: Query, alias: Optional[str] = None) -> None:
         if self.validation_mode == ColumnValidationMode.DO_NOTHING:
@@ -124,6 +136,7 @@ def validate(self, query: Query, alias: Optional[str] = None) -> None:
             if (
                 column.table_name == alias
                 and column.column_name not in self.entity_data_model
+                and column.column_name not in self.mapped_columns
             ):
                 missing.add(column.column_name)
 
5 changes: 5 additions & 0 deletions snuba/utils/schemas.py
@@ -239,13 +239,16 @@ def __init__(self, columns: Sequence[Column[SchemaModifiers]]) -> None:
 
         self._lookup: MutableMapping[str, FlattenedColumn] = {}
         self._flattened: List[FlattenedColumn] = []
+        self._nested = {}
         for column in self.__columns:
             if not isinstance(column, WildcardColumn):
                 self._flattened.extend(column.type.flatten(column.name))
+            if isinstance(column.type, Nested):
+                self._nested[column.name] = column
 
         for col in self._flattened:
             if col.flattened in self._lookup:
                 raise RuntimeError("Duplicate column: {}".format(col.flattened))
 
             self._lookup[col.flattened] = col
             # also store it by the escaped name
@@ -283,6 +286,8 @@ def get(
     def __contains__(self, key: str) -> bool:
         if key in self._lookup:
             return True
+        if key in self._nested:
+            return True
 
         if self._wildcard_columns:
             match = NESTED_COL_EXPR_RE.match(key)
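A side effect of the ColumnSet change above is that a bare Nested column name (for example tags) now counts as present, not only its flattened children (tags.key, tags.value). A toy illustration of that containment rule, using plain sets rather than Snuba's real ColumnSet bookkeeping:

# simplified stand-ins for the _lookup / _nested structures shown above
flattened = {"event_id", "tags.key", "tags.value"}  # flattened leaf columns
nested = {"tags"}                                    # parent Nested columns


def contains(name: str) -> bool:
    return name in flattened or name in nested


print(contains("tags.key"))  # True, as before
print(contains("tags"))      # True only with the nested check added above
print(contains("asdf"))      # False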
65 changes: 65 additions & 0 deletions tests/datasets/configuration/column_validation_entity.yaml
@@ -0,0 +1,65 @@
version: v1
kind: entity
name: nested

schema:
  - name: event_id
    type: String
  - name: fixed_event_id
    type: FixedString
    args:
      length: 420
  - name: project_id
    type: UInt
    args:
      size: 64
  - name: primary_hash
    type: FixedString
    args:
      length: 69
      schema_modifiers:
        - nullable
  - name: tags
    type: Nested
    args:
      subcolumns:
        - name: key
          type: String
        - name: value
          type: String
storages:
  - storage: discover
    translation_mappers:
      columns:
        -
          mapper: ColumnToIPAddress
          args:
            from_table_name:
            from_col_name: ip_address
        -
          mapper: ColumnToNullIf
          args:
            from_table_name:
            from_col_name: userColumnToNullIf
        -
          mapper: ColumnToMapping
          args:
            from_table_name:
            from_col_name: geo_country_code
            to_nested_col_table_name:
            to_nested_col_name: contexts
            to_nested_mapping_key: geo.country_code
            nullable: True
        -
          mapper: ColumnToColumn
          args:
            from_table_name:
            from_col_name: email
            to_table_name:
            to_col_name: user_email
    is_writable: false
required_time_column: event_id
storage_selector:
  selector: DefaultQueryStorageSelector
query_processors: []
validators: []
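A quick way to poke at this fixture from a Python shell, assuming a Snuba checkout as the working directory (this mirrors what the test below does):

from snuba.datasets.configuration.entity_builder import build_entity_from_config

entity = build_entity_from_config(
    "tests/datasets/configuration/column_validation_entity.yaml"
)
# The built-in EntityContainsColumnsValidator created for this entity should now
# treat the mapped names (ip_address, userColumnToNullIf, geo_country_code, email)
# as valid columns.
for validator in entity.get_validators():
    print(type(validator).__name__)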
62 changes: 43 additions & 19 deletions tests/datasets/validation/test_entity_contains_columns_validator.py
@@ -1,6 +1,7 @@
 import pytest
 
 from snuba.datasets.configuration.entity_builder import build_entity_from_config
+from snuba.datasets.entity import Entity
 from snuba.query import SelectedExpression
 from snuba.query.data_source.simple import Entity as QueryEntity
 from snuba.query.exceptions import InvalidQueryException
@@ -11,17 +12,47 @@
     EntityContainsColumnsValidator,
 )
 
-entity_contains_columns_tests = [
-    pytest.param(
-        "tests/datasets/configuration/entity_with_fixed_string.yaml",
-        id="Validate Entity Columns",
-    )
-]
+CONFIG_PATH = "tests/datasets/configuration/column_validation_entity.yaml"
+
+
+def get_validator(entity: Entity) -> EntityContainsColumnsValidator:
+    validator = None
+    for v in entity.get_validators():
+        if isinstance(v, EntityContainsColumnsValidator):
+            validator = v
+
+    assert validator is not None
+    validator.validation_mode = ColumnValidationMode.ERROR
+    return validator
+
+
+def test_mapped_columns_validation() -> None:
+    entity = build_entity_from_config(CONFIG_PATH)
+
+    query_entity = QueryEntity(entity.entity_key, entity.get_data_model())
+    columns = [
+        SelectedExpression(
+            "ip_address", Column("_snuba_ip_address", None, "ip_address")
+        ),
+        SelectedExpression("email", Column("_snuba_email", None, "email")),
+    ]
+
+    bad_query = LogicalQuery(
+        query_entity,
+        selected_columns=columns
+        + [SelectedExpression("asdf", Column("_snuba_asdf", None, "asdf"))],
+    )
+
+    good_query = LogicalQuery(query_entity, selected_columns=columns)
+    validator = get_validator(entity)
+    with pytest.raises(InvalidQueryException):
+        validator.validate(bad_query)
+
+    validator.validate(good_query)
 
 
-@pytest.mark.parametrize("config_path", entity_contains_columns_tests)
-def test_outcomes_columns_validation(config_path: str) -> None:
-    entity = build_entity_from_config(config_path)
+def test_outcomes_columns_validation() -> None:
+    entity = build_entity_from_config(CONFIG_PATH)
 
     query_entity = QueryEntity(entity.entity_key, entity.get_data_model())
 
@@ -47,20 +78,15 @@ def test_outcomes_columns_validation(config_path: str) -> None:
             for column in entity.get_data_model().columns
         ],
     )
 
-    validator = EntityContainsColumnsValidator(
-        entity.get_data_model(), validation_mode=ColumnValidationMode.ERROR
-    )
-
+    validator = get_validator(entity)
     with pytest.raises(InvalidQueryException):
         validator.validate(bad_query)
 
     validator.validate(good_query)
 
 
-@pytest.mark.parametrize("config_path", entity_contains_columns_tests)
-def test_in_where_clause_and_function(config_path):
-    entity = build_entity_from_config(config_path)
+def test_in_where_clause_and_function() -> None:
+    entity = build_entity_from_config(CONFIG_PATH)
 
     query_entity = QueryEntity(entity.entity_key, entity.get_data_model())
 
@@ -76,9 +102,7 @@ def test_in_where_clause_and_function(config_path):
         ],
         condition=FunctionCall(None, "f1", (Column(None, None, "bad_column"),)),
     )
-    validator = EntityContainsColumnsValidator(
-        entity.get_data_model(), validation_mode=ColumnValidationMode.ERROR
-    )
+    validator = get_validator(entity)
 
     with pytest.raises(InvalidQueryException):
         validator.validate(bad_query)