feat(tokenizer): Allow underscore separated number literals (#4536)

* feat(tokenizer): Allow underscore separated number literals
* PR Feedback 1
tobymao · Dec 19, 2024 · 6992c18 · 6992c18
1 parent 7a517d7
commit 6992c18
Show file tree

Hide file tree

Showing 7 changed files with 28 additions and 2 deletions.
diff --git a/sqlglot/dialects/clickhouse.py b/sqlglot/dialects/clickhouse.py
@@ -166,6 +166,8 @@ class ClickHouse(Dialect):
     LOG_BASE_FIRST: t.Optional[bool] = None
     FORCE_EARLY_ALIAS_REF_EXPANSION = True
     PRESERVE_ORIGINAL_NAMES = True
+    NUMBERS_CAN_BE_UNDERSCORE_SEPARATED = True
+    IDENTIFIERS_CAN_START_WITH_DIGIT = True
 
     # https://github.com/ClickHouse/ClickHouse/issues/33935#issue-1112165779
     NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_SENSITIVE

diff --git a/sqlglot/dialects/dialect.py b/sqlglot/dialects/dialect.py
@@ -420,6 +420,9 @@ class Dialect(metaclass=_Dialect):
     SUPPORTS_VALUES_DEFAULT = True
     """Whether the DEFAULT keyword is supported in the VALUES clause."""
 
+    NUMBERS_CAN_BE_UNDERSCORE_SEPARATED = False
+    """Whether number literals can include underscores for better readability"""
+
     REGEXP_EXTRACT_DEFAULT_GROUP = 0
     """The default value for the capturing group."""
 

diff --git a/sqlglot/tokens.py b/sqlglot/tokens.py
@@ -997,6 +997,7 @@ def __init__(self, dialect: DialectType = None) -> None:
             self._rs_dialect_settings = RsTokenizerDialectSettings(
                 unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES,
                 identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT,
+                numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED,
             )
 
         self.reset()
@@ -1300,8 +1301,12 @@ def _scan_number(self) -> None:
                     self._add(TokenType.NUMBER, number_text)
                     self._add(TokenType.DCOLON, "::")
                     return self._add(token_type, literal)
-                elif self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT:
-                    return self._add(TokenType.VAR)
+                else:
+                    replaced = literal.replace("_", "")
+                    if self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED and replaced.isdigit():
+                        return self._add(TokenType.NUMBER, number_text + replaced)
+                    if self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT:
+                        return self._add(TokenType.VAR)
 
                 self._advance(-len(literal))
                 return self._add(TokenType.NUMBER, number_text)

diff --git a/sqlglotrs/src/settings.rs b/sqlglotrs/src/settings.rs
@@ -171,6 +171,7 @@ impl TokenizerSettings {
 /// Dialect-specific switches that the tokenizer consults while scanning
 /// (read through `self.dialect_settings` in `tokenizer.rs`).
 pub struct TokenizerDialectSettings {
     /// Filled from the dialect's `UNESCAPED_SEQUENCES` on the Python side.
     /// NOTE(review): presumably maps an escape sequence to its replacement
     /// text — confirm against the code that reads it.
     pub unescaped_sequences: HashMap<String, String>,
     /// When set, a digit-led token that is not otherwise recognized while
     /// scanning a number is emitted as a VAR token.
     pub identifiers_can_start_with_digit: bool,
     /// When set, number literals may contain `_` separators; the tokenizer
     /// strips them before emitting the NUMBER token.
+    pub numbers_can_be_underscore_separated: bool,
 }
 
 #[pymethods]
@@ -179,10 +180,12 @@ impl TokenizerDialectSettings {
     /// Builds the dialect settings, storing each argument in the field of
     /// the same name without further validation.
     ///
     /// NOTE(review): presumably this is the constructor the Python tokenizer
     /// reaches via `RsTokenizerDialectSettings(...)`, passing the dialect's
     /// `UNESCAPED_SEQUENCES`, `IDENTIFIERS_CAN_START_WITH_DIGIT` and
     /// `NUMBERS_CAN_BE_UNDERSCORE_SEPARATED` — confirm against the binding.
     pub fn new(
         unescaped_sequences: HashMap<String, String>,
         identifiers_can_start_with_digit: bool,
+        numbers_can_be_underscore_separated: bool,
     ) -> Self {
         TokenizerDialectSettings {
             unescaped_sequences,
             identifiers_can_start_with_digit,
+            numbers_can_be_underscore_separated,
         }
     }
 }
diff --git a/sqlglotrs/src/tokenizer.rs b/sqlglotrs/src/tokenizer.rs
@@ -531,10 +531,14 @@ impl<'a> TokenizerState<'a> {
                     )
                     .map(|x| *x);
 
+                let replaced = literal.replace("_", "");
+
                 if let Some(unwrapped_token_type) = token_type {
                     self.add(self.token_types.number, Some(number_text))?;
                     self.add(self.token_types.dcolon, Some("::".to_string()))?;
                     self.add(unwrapped_token_type, Some(literal))?;
+                } else if self.dialect_settings.numbers_can_be_underscore_separated && self.is_numeric(&replaced) {
+                    self.add(self.token_types.number, Some(number_text + &replaced))?;
                 } else if self.dialect_settings.identifiers_can_start_with_digit {
                     self.add(self.token_types.var, None)?;
                 } else {
@@ -706,6 +710,10 @@ impl<'a> TokenizerState<'a> {
         )
     }
 
+    /// Returns `true` when `s` is non-empty and consists solely of ASCII
+    /// decimal digits (`0`-`9`).
+    ///
+    /// The empty string is rejected on purpose: a literal made up only of
+    /// underscores is empty once they are stripped, and `Iterator::all` is
+    /// vacuously true on an empty iterator, which would otherwise classify
+    /// such a literal as a number. Rejecting it agrees with the
+    /// `str.isdigit()` check used by the Python tokenizer, which is `False`
+    /// for `""`.
+    fn is_numeric(&self, s: &str) -> bool {
+        !s.is_empty() && s.chars().all(|c| c.is_ascii_digit())
+    }
+
     fn extract_value(&mut self) -> Result<String, TokenizerError> {
         loop {
             if !self.peek_char.is_whitespace()

diff --git a/tests/dialects/test_clickhouse.py b/tests/dialects/test_clickhouse.py
@@ -549,6 +549,9 @@ def test_clickhouse(self):
             "SELECT name FROM data WHERE NOT ((SELECT DISTINCT name FROM data) IS NULL)",
         )
 
+        self.validate_identity("SELECT 1_2_3_4_5", "SELECT 12345")
+        self.validate_identity("SELECT 1_b", "SELECT 1_b")
+
     def test_clickhouse_values(self):
         values = exp.select("*").from_(
             exp.values([exp.tuple_(1, 2, 3)], alias="subq", columns=["a", "b", "c"])

diff --git a/tests/dialects/test_hive.py b/tests/dialects/test_hive.py
@@ -806,6 +806,8 @@ def test_hive(self):
             },
         )
 
+        self.validate_identity("SELECT 1_2")
+
     def test_escapes(self) -> None:
         self.validate_identity("'\n'", "'\\n'")
         self.validate_identity("'\\n'")