Skip to content

Commit 6148386

Browse files
committed
Add support for (?<name>expr).
This follows golang/go@ee61186 to some extent. I took the opportunity to simplify the parsing logic and also fixed a bug in `Regexp::Equal()` that had gone unnoticed...

Change-Id: I90abec942d39b02a1c6d1ac95cd3b1cc66ec7b2a
Reviewed-on: https://code-review.googlesource.com/c/re2/+/61690
Reviewed-by: Alex Chernyakhovsky <achernya@google.com>
Reviewed-by: Paul Wankadia <junyer@google.com>
1 parent cb000a8 commit 6148386

File tree

5 files changed

+40
-16
lines changed

5 files changed

+40
-16
lines changed

doc/syntax.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ <h1>RE2 regular expression syntax reference</h1>
6262
<tr><td colspan=2><b>Grouping:</b></td></tr>
6363
<tr><td><code>(re)</code></td><td>numbered capturing group (submatch)</td></tr>
6464
<tr><td><code>(?P&lt;name&gt;re)</code></td><td>named &amp; numbered capturing group (submatch)</td></tr>
65-
<tr><td><code><font color=#808080>(?&lt;name&gt;re)</font></code></td><td>named &amp; numbered capturing group (submatch) </td></tr>
65+
<tr><td><code>(?&lt;name&gt;re)</code></td><td>named &amp; numbered capturing group (submatch)</td></tr>
6666
<tr><td><code><font color=#808080>(?'name're)</font></code></td><td>named &amp; numbered capturing group (submatch) </td></tr>
6767
<tr><td><code>(?:re)</code></td><td>non-capturing group</td></tr>
6868
<tr><td><code>(?flags)</code></td><td>set flags within current group; non-capturing</td></tr>

doc/syntax.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ x{n}+ exactly «n» «x», possessive NOT SUPPORTED
5151
Grouping:
5252
(re) numbered capturing group (submatch)
5353
(?P<name>re) named & numbered capturing group (submatch)
54-
(?<name>re) named & numbered capturing group (submatch) NOT SUPPORTED
54+
(?<name>re) named & numbered capturing group (submatch)
5555
(?'name're) named & numbered capturing group (submatch) NOT SUPPORTED
5656
(?:re) non-capturing group
5757
(?flags) set flags within current group; non-capturing

re2/parse.cc

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2059,8 +2059,6 @@ bool Regexp::ParseState::ParsePerlFlags(absl::string_view* s) {
20592059
return false;
20602060
}
20612061

2062-
t.remove_prefix(2); // "(?"
2063-
20642062
// Check for named captures, first introduced in Python's regexp library.
20652063
// As usual, there are three slightly different syntaxes:
20662064
//
@@ -2074,22 +2072,23 @@ bool Regexp::ParseState::ParsePerlFlags(absl::string_view* s) {
20742072
// support all three as well. EcmaScript 4 uses only the Python form.
20752073
//
20762074
// In both the open source world (via Code Search) and the
2077-
// Google source tree, (?P<expr>name) is the dominant form,
2078-
// so that's the one we implement. One is enough.
2079-
if (t.size() > 2 && t[0] == 'P' && t[1] == '<') {
2075+
// Google source tree, (?P<name>expr) and (?<name>expr) are the
2076+
// dominant forms of named captures and both are supported.
2077+
if ((t.size() > 4 && t[2] == 'P' && t[3] == '<') ||
2078+
(t.size() > 3 && t[2] == '<')) {
20802079
// Pull out name.
2081-
size_t end = t.find('>', 2);
2080+
size_t begin = t[2] == 'P' ? 4 : 3;
2081+
size_t end = t.find('>', begin);
20822082
if (end == absl::string_view::npos) {
2083-
if (!IsValidUTF8(*s, status_))
2083+
if (!IsValidUTF8(t, status_))
20842084
return false;
20852085
status_->set_code(kRegexpBadNamedCapture);
2086-
status_->set_error_arg(*s);
2086+
status_->set_error_arg(t);
20872087
return false;
20882088
}
20892089

2090-
// t is "P<name>...", t[end] == '>'
2091-
absl::string_view capture(t.data()-2, end+3); // "(?P<name>"
2092-
absl::string_view name(t.data()+2, end-2); // "name"
2090+
absl::string_view capture(t.data(), end+1);
2091+
absl::string_view name(t.data()+begin, end-begin);
20932092
if (!IsValidUTF8(name, status_))
20942093
return false;
20952094
if (!IsValidCaptureName(name)) {
@@ -2103,11 +2102,12 @@ bool Regexp::ParseState::ParsePerlFlags(absl::string_view* s) {
21032102
return false;
21042103
}
21052104

2106-
s->remove_prefix(
2107-
static_cast<size_t>(capture.data() + capture.size() - s->data()));
2105+
s->remove_prefix(capture.size());
21082106
return true;
21092107
}
21102108

2109+
t.remove_prefix(2); // "(?"
2110+
21112111
bool negated = false;
21122112
bool sawflags = false;
21132113
int nflags = flags_;

re2/regexp.cc

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -400,7 +400,13 @@ static bool TopEqual(Regexp* a, Regexp* b) {
400400
a->max() == b->max();
401401

402402
case kRegexpCapture:
403-
return a->cap() == b->cap() && a->name() == b->name();
403+
if (a->name() == NULL || b->name() == NULL) {
404+
// One pointer is null, so the other pointer should also be null.
405+
return a->cap() == b->cap() && a->name() == b->name();
406+
} else {
407+
// Neither pointer is null, so compare the pointees for equality.
408+
return a->cap() == b->cap() && *a->name() == *b->name();
409+
}
404410

405411
case kRegexpHaveMatch:
406412
return a->match_id() == b->match_id();

re2/testing/parse_test.cc

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,8 @@ static Test tests[] = {
166166
// Test named captures
167167
{ "(?P<name>a)", "cap{name:lit{a}}" },
168168
{ "(?P<中文>a)", "cap{中文:lit{a}}" },
169+
{ "(?<name>a)", "cap{name:lit{a}}" },
170+
{ "(?<中文>a)", "cap{中文:lit{a}}" },
169171

170172
// Case-folded literals
171173
{ "[Aa]", "litfold{a}" },
@@ -396,6 +398,11 @@ const char* badtests[] = {
396398
"(?P<name",
397399
"(?P<x y>a)",
398400
"(?P<>a)",
401+
"(?<name>a",
402+
"(?<name>",
403+
"(?<name",
404+
"(?<x y>a)",
405+
"(?<>a)",
399406
"[a-Z]",
400407
"(?i)[a-Z]",
401408
"a{100000}",
@@ -416,6 +423,7 @@ const char* only_perl[] = {
416423
"\\Q\\\\\\\\\\E",
417424
"(?:a)",
418425
"(?P<name>a)",
426+
"(?<name>a)",
419427
};
420428

421429
// Valid in POSIX, bad in Perl.
@@ -505,6 +513,16 @@ TEST(NamedCaptures, ErrorArgs) {
505513
EXPECT_TRUE(re == NULL);
506514
EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
507515
EXPECT_EQ(status.error_arg(), "(?P<space bar>");
516+
517+
re = Regexp::Parse("test(?<name", Regexp::LikePerl, &status);
518+
EXPECT_TRUE(re == NULL);
519+
EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
520+
EXPECT_EQ(status.error_arg(), "(?<name");
521+
522+
re = Regexp::Parse("test(?<space bar>z)", Regexp::LikePerl, &status);
523+
EXPECT_TRUE(re == NULL);
524+
EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
525+
EXPECT_EQ(status.error_arg(), "(?<space bar>");
508526
}
509527

510528
} // namespace re2

0 commit comments

Comments
 (0)