mk/subst.mk: fix edge case in detection of identity substitutions

In a basic regular expression, a dollar-sign only means end-of-string if
it appears at the end of the pattern, or (at the choice of the
implementation) at the end of a \(...\) subexpression.

This affects the package converters/help2man, which uses a regular
expression containing a dollar-sign in a non-final position.  Until now,
this regular expression was not detected as an identity substitution even
though it is one.
This commit is contained in:
rillig 2020-05-11 19:52:13 +00:00
parent b9f83bca93
commit 21aab909de
2 changed files with 31 additions and 7 deletions

View file

@ -1,5 +1,5 @@
#! /usr/bin/awk -f
# $NetBSD: subst-identity.awk,v 1.2 2020/05/06 06:14:56 rillig Exp $
# $NetBSD: subst-identity.awk,v 1.3 2020/05/11 19:52:14 rillig Exp $
#
# Tests whether a sed(1) command line consists of only identity substitutions
# like s,id,id,.
@ -9,13 +9,17 @@
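# If the given string starts with a regular expression that matches
# exactly one literal character, returns that character, padded with "x"
# so that the length of the result equals the number of characters the
# expression occupies in the pattern (1 for a plain character, 2 for an
# escaped one, 3 for a bracket expression like [.]).
# Returns "" otherwise.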
function identity_char(s) {
function identity_char(s, sep, i) {
if (s ~ /^[\t -~]/ && s !~ /^[$&*.\[\\\]^]/)
return substr(s, 1, 1);
if (s ~ /^\\[$*.\[\]^]/)
return substr(s, 2, 1) "x";
if (s ~ /^\[[$*.]\]/)
return substr(s, 2, 1) "xx";
if (substr(s, 1, 1) == "$" && substr(s, 2, 1) != sep)
return substr(s, 1, 1);
if (substr(s, 1, 1) == "^" && i > 3)
return substr(s, 1, 1);
return "";
}
@ -29,7 +33,7 @@ function is_identity_subst(s, len, i, sep, pat_from, pat_to, ch, subst) {
i = 3;
pat_to = "";
while (i < len && substr(s, i, 1) != sep) {
ch = identity_char(substr(s, i));
ch = identity_char(substr(s, i), sep, i);
if (ch == "")
break;
pat_to = pat_to substr(ch, 1, 1);

View file

@ -1,5 +1,5 @@
#! /bin/sh
# $NetBSD: subst.sh,v 1.35 2020/05/11 19:17:22 rillig Exp $
# $NetBSD: subst.sh,v 1.36 2020/05/11 19:52:13 rillig Exp $
#
# Tests for mk/subst.mk.
#
@ -1219,9 +1219,29 @@ if test_case_begin "identity substitution implementation"; then
# See converters/help2man for an example.
assert_identity 'yes' -e 's,\$(var),$(var),'
# An unescaped dollar means end-of-line and cannot be part of an
# identity substitution. This may happen, but is clearly a typo.
assert_identity 'no' -e 's,$(var),$(var),'
# POSIX 2004 and 2018 both define in section "9.3.8 BRE Expression
# Anchoring" that a dollar-sign at the end of the regular expression
# is an anchor that matches the end of the string.
#
# A dollar-sign followed by \) may or may not be an anchor.
# In all other cases the dollar is an ordinary character.
assert_identity 'yes' -e 's,$(var),$(var),'
# Since this dollar-sign may or may not be an anchor, treat the
# whole regular expression as not-an-identity.
#
# Since a regular expression with a subexpression must contain
# \( and \), it does not count as an identity substitution anyway,
# which makes the implementation simple.
assert_identity 'no' -e 's,aaa\(aaa$\),aaa\(aaa$\),'
assert_identity 'yes' -e 's,$a,$a,'
assert_identity 'no' -e 's,a$,a$,'
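# Same for the circumflex: it is an anchor only at the beginning of
# the regular expression, and possibly directly after \(.
# In all other positions it is an ordinary character.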
assert_identity 'yes' -e 's,a^,a^,'
assert_identity 'no' -e 's,^a,^a,'
assert_identity 'no' -e 's,\(^aaa\)aaa,\(^aaa\)aaa,'
test_case_end
fi