Merge "fix tag regexp to match quoted groups correctly" into main

commit: b5ac84bafc80c3e56a1ede317b7ae4173e762a4a [log] [tgz]
author: mike bayer <mike_mp@zzzcomputing.com> Mon Aug 29 17:59:10 2022 +0000
committer: Gerrit Code Review <gerrit@ci3.zzzcomputing.com> Mon Aug 29 17:59:10 2022 +0000
tree: 6ed749d86ca00b55aa8c22e49db352bbc2f1de2c
parent: dbbaad3918c7d19cb71ca4b0b7ebe12661fba47b [diff]
parent: 925760291d6efec64fda6e9dd1fd9cfbd5be068c [diff]
diff --git a/doc/build/unreleased/366.rst b/doc/build/unreleased/366.rst
new file mode 100644
index 0000000..27b0278
--- /dev/null
+++ b/doc/build/unreleased/366.rst

@@ -0,0 +1,9 @@
+.. change::
+    :tags: bug, lexer
+    :tickets: 366
+
+    Fixed issue in lexer where the regexp used to match tags would not
+    correctly interpret quoted sections individually. While this parsing issue
+    still produced the same expected tag structure later on, the mishandling
+    of quoted sections could also cause a regexp crash if a tag had a large
+    number of quotes within its quoted sections.
\ No newline at end of file

diff --git a/mako/lexer.py b/mako/lexer.py
index bfcf286..77a2483 100644
--- a/mako/lexer.py
+++ b/mako/lexer.py

@@ -272,20 +272,24 @@
         return self.template
 
     def match_tag_start(self):
-        match = self.match(
-            r"""
+        reg = r"""
             \<%     # opening tag
 
             ([\w\.\:]+)   # keyword
 
-            ((?:\s+\w+|\s*=\s*|".*?"|'.*?')*)  # attrname, = \
+            ((?:\s+\w+|\s*=\s*|"[^"]*?"|'[^']*?'|\s*,\s*)*)  # attrname, = \
                                                #        sign, string expression
+                                               # comma is for backwards compat
+                                               # identified in #366
 
             \s*     # more whitespace
 
             (/)?>   # closing
 
-            """,
+        """
+
+        match = self.match(
+            reg,
             re.I | re.S | re.X,
         )
 

diff --git a/test/test_lexer.py b/test/test_lexer.py
index 255c128..a7b6fe3 100644
--- a/test/test_lexer.py
+++ b/test/test_lexer.py

@@ -1,5 +1,7 @@
 import re
 
+import pytest
+
 from mako import compat
 from mako import exceptions
 from mako import parsetree
@@ -146,6 +148,10 @@
         """
         assert_raises(exceptions.CompileException, Lexer(template).parse)
 
+    def test_tag_many_quotes(self):
+        template = "<%0" + '"' * 3000
+        assert_raises(exceptions.SyntaxException, Lexer(template).parse)
+
     def test_unmatched_tag(self):
         template = """
         <%namespace name="bar">
@@ -432,9 +438,16 @@
             ),
         )
 
-    def test_pagetag(self):
-        template = """
-            <%page cached="True", args="a, b"/>
+    @pytest.mark.parametrize("comma,numchars", [(",", 48), ("", 47)])
+    def test_pagetag(self, comma, numchars):
+        # note that the comma here looks like:
+        # <%page cached="True", args="a, b"/>
+        # that's what this test has looked like for decades; however, the
+        # comma there is not actually the right syntax.  When issue #366
+        # was fixed, the regexp was altered to accommodate this comma for
+        # backwards compatibility
+        template = f"""
+            <%page cached="True"{comma} args="a, b"/>
 
             some template
         """
@@ -453,7 +466,7 @@
 
             some template
         """,
-                        (2, 48),
+                        (2, numchars),
                     ),
                 ],
             ),
commit	b5ac84bafc80c3e56a1ede317b7ae4173e762a4a	[log] [tgz]
author	mike bayer <mike_mp@zzzcomputing.com>	Mon Aug 29 17:59:10 2022 +0000
committer	Gerrit Code Review <gerrit@ci3.zzzcomputing.com>	Mon Aug 29 17:59:10 2022 +0000
tree	6ed749d86ca00b55aa8c22e49db352bbc2f1de2c
parent	dbbaad3918c7d19cb71ca4b0b7ebe12661fba47b [diff]
parent	925760291d6efec64fda6e9dd1fd9cfbd5be068c [diff]