Skip to content

Commit

Permalink
feat(tokenizer): Allow underscore separated number literals (#4536)
Browse files Browse the repository at this point in the history
* feat(tokenizer): Allow underscore separated number literals

* PR Feedback 1
  • Loading branch information
VaggelisD authored Dec 19, 2024
1 parent 7a517d7 commit 6992c18
Show file tree
Hide file tree
Showing 7 changed files with 28 additions and 2 deletions.
2 changes: 2 additions & 0 deletions sqlglot/dialects/clickhouse.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,8 @@ class ClickHouse(Dialect):
LOG_BASE_FIRST: t.Optional[bool] = None
FORCE_EARLY_ALIAS_REF_EXPANSION = True
PRESERVE_ORIGINAL_NAMES = True
NUMBERS_CAN_BE_UNDERSCORE_SEPARATED = True
IDENTIFIERS_CAN_START_WITH_DIGIT = True

# https://github.com/ClickHouse/ClickHouse/issues/33935#issue-1112165779
NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_SENSITIVE
Expand Down
3 changes: 3 additions & 0 deletions sqlglot/dialects/dialect.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,9 @@ class Dialect(metaclass=_Dialect):
SUPPORTS_VALUES_DEFAULT = True
"""Whether the DEFAULT keyword is supported in the VALUES clause."""

NUMBERS_CAN_BE_UNDERSCORE_SEPARATED = False
"""Whether number literals can include underscores for better readability"""

REGEXP_EXTRACT_DEFAULT_GROUP = 0
"""The default value for the capturing group."""

Expand Down
9 changes: 7 additions & 2 deletions sqlglot/tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -997,6 +997,7 @@ def __init__(self, dialect: DialectType = None) -> None:
self._rs_dialect_settings = RsTokenizerDialectSettings(
unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES,
identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT,
numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED,
)

self.reset()
Expand Down Expand Up @@ -1300,8 +1301,12 @@ def _scan_number(self) -> None:
self._add(TokenType.NUMBER, number_text)
self._add(TokenType.DCOLON, "::")
return self._add(token_type, literal)
elif self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT:
return self._add(TokenType.VAR)
else:
replaced = literal.replace("_", "")
if self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED and replaced.isdigit():
return self._add(TokenType.NUMBER, number_text + replaced)
if self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT:
return self._add(TokenType.VAR)

self._advance(-len(literal))
return self._add(TokenType.NUMBER, number_text)
Expand Down
3 changes: 3 additions & 0 deletions sqlglotrs/src/settings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ impl TokenizerSettings {
/// Per-dialect switches passed to the tokenizer alongside the general tokenizer settings.
pub struct TokenizerDialectSettings {
// NOTE(review): presumably maps an escaped sequence to its unescaped replacement;
// its use is not visible in this view — confirm against the tokenizer.
pub unescaped_sequences: HashMap<String, String>,
// When true, a digit-led token followed by a literal that is neither a known
// numeric-type suffix nor an underscore-separated number is emitted as a `var` token.
pub identifiers_can_start_with_digit: bool,
// When true, a number literal may contain underscores for readability; the
// underscores are stripped and a single number token is emitted (e.g. `1_2_3` -> `123`).
pub numbers_can_be_underscore_separated: bool,
}

#[pymethods]
Expand All @@ -179,10 +180,12 @@ impl TokenizerDialectSettings {
pub fn new(
unescaped_sequences: HashMap<String, String>,
identifiers_can_start_with_digit: bool,
numbers_can_be_underscore_separated: bool,
) -> Self {
TokenizerDialectSettings {
unescaped_sequences,
identifiers_can_start_with_digit,
numbers_can_be_underscore_separated,
}
}
}
8 changes: 8 additions & 0 deletions sqlglotrs/src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -531,10 +531,14 @@ impl<'a> TokenizerState<'a> {
)
.map(|x| *x);

let replaced = literal.replace("_", "");

if let Some(unwrapped_token_type) = token_type {
self.add(self.token_types.number, Some(number_text))?;
self.add(self.token_types.dcolon, Some("::".to_string()))?;
self.add(unwrapped_token_type, Some(literal))?;
} else if self.dialect_settings.numbers_can_be_underscore_separated && self.is_numeric(&replaced) {
self.add(self.token_types.number, Some(number_text + &replaced))?;
} else if self.dialect_settings.identifiers_can_start_with_digit {
self.add(self.token_types.var, None)?;
} else {
Expand Down Expand Up @@ -706,6 +710,10 @@ impl<'a> TokenizerState<'a> {
)
}

/// Returns `true` when `s` is non-empty and every character is an ASCII decimal digit.
///
/// The empty string is rejected explicitly: `Iterator::all` is vacuously true on an
/// empty input, which would classify a literal made only of underscores (whose
/// underscore-stripped form is `""`) as a number. The Python tokenizer performs the
/// same check with `replaced.isdigit()`, which is false for the empty string, so
/// rejecting it here keeps both tokenizers in agreement.
fn is_numeric(&self, s: &str) -> bool {
    !s.is_empty() && s.chars().all(|c| c.is_ascii_digit())
}

fn extract_value(&mut self) -> Result<String, TokenizerError> {
loop {
if !self.peek_char.is_whitespace()
Expand Down
3 changes: 3 additions & 0 deletions tests/dialects/test_clickhouse.py
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,9 @@ def test_clickhouse(self):
"SELECT name FROM data WHERE NOT ((SELECT DISTINCT name FROM data) IS NULL)",
)

self.validate_identity("SELECT 1_2_3_4_5", "SELECT 12345")
self.validate_identity("SELECT 1_b", "SELECT 1_b")

def test_clickhouse_values(self):
values = exp.select("*").from_(
exp.values([exp.tuple_(1, 2, 3)], alias="subq", columns=["a", "b", "c"])
Expand Down
2 changes: 2 additions & 0 deletions tests/dialects/test_hive.py
Original file line number Diff line number Diff line change
Expand Up @@ -806,6 +806,8 @@ def test_hive(self):
},
)

self.validate_identity("SELECT 1_2")

def test_escapes(self) -> None:
self.validate_identity("'\n'", "'\\n'")
self.validate_identity("'\\n'")
Expand Down

0 comments on commit 6992c18

Please sign in to comment.