Kill regexp.h

Message ID 20150712195501.D8E5C14B9A@panix1.panix.com
State Committed
Headers

Commit Message

Zack Weinberg July 12, 2015, 7:27 p.m. UTC
  <regexp.h> (not to be confused with <regex.h>) is an obsolete and
frankly horrible regular expression-matching API.  It was part of SVID
but was withdrawn in Issue 5 (for reference, we're on Issue 7 now).
It doesn't do anything you can't do with <regex.h>, and using it
involves defining a bunch of macros before including the header.
Moreover, the code in regexp.h that uses those macros has been buggy
since its creation (in 1996) and no one has noticed, which indicates
to me that there are no users.  (Specifically, RETURN() is used in a
whole bunch of cases where it should have been ERROR().)

Therefore, this patch stubs out the header and demotes the
implementation to compatibility symbols.  I hope this is not too late
to squeeze into glibc 2.22.  (Note: there isn't any predefined macro
to put .bss symbols in a compatibility subsection.  Using
'attribute_data_compat_section' would make the libc on disk bigger,
which seems like a move in the wrong direction.  It's only three
pointers' worth of junk, anyway...)

The ABI checker does not appear to cover these symbols; I manually
tested the effect of the patch as follows:

# glibc 2.19 (Debian)
$ readelf --dyn-syms /lib/x86_64-linux-gnu/libc.so.6 | grep -E 'advance|step|loc[12s]'
   540: 00000000000e4da0   104 FUNC    WEAK   DEFAULT   12 step@@GLIBC_2.2.5
  1364: 00000000000e4e10    84 FUNC    WEAK   DEFAULT   12 advance@@GLIBC_2.2.5
  2051: 00000000003a85e8     8 OBJECT  GLOBAL DEFAULT   32 loc1@@GLIBC_2.2.5
  2054: 00000000003a85f0     8 OBJECT  GLOBAL DEFAULT   32 loc2@@GLIBC_2.2.5
  2198: 00000000003a85f8     8 OBJECT  GLOBAL DEFAULT   32 locs@@GLIBC_2.2.5

# patched libc
$ readelf --dyn-syms ./libc.so.6 | grep -E 'advance|step|loc[12s]'
   541: 000000000011eb80   104 FUNC    WEAK   DEFAULT   12 step@GLIBC_2.2.5
  1374: 000000000011ebf0    84 FUNC    WEAK   DEFAULT   12 advance@GLIBC_2.2.5
  2065: 00000000003a35e0     8 OBJECT  GLOBAL DEFAULT   32 loc1@GLIBC_2.2.5
  2068: 00000000003a35e8     8 OBJECT  GLOBAL DEFAULT   32 loc2@GLIBC_2.2.5
  2211: 00000000003a35d8     8 OBJECT  GLOBAL DEFAULT   32 locs@GLIBC_2.2.5

$ cat test.c
#include <inttypes.h>

extern char *loc1;
extern char *loc2;
extern char *locs;
extern int step();
extern int advance();

int main(void)
{
  return (int)((intptr_t)step + (intptr_t)advance + (intptr_t)&locs + (intptr_t)&loc1 + (intptr_t)&loc2);
}

$ gcc-5 test.c; echo $?
0
$ ./testrun.sh ./a.out; echo $?
136
$ gcc-5 -nostdlib -nostartfiles csu/crt1.o csu/crti.o `gcc-5 --print-file-name=crtbegin.o` test.c libc.so.6 libc_nonshared.a -lgcc `gcc-5 --print-file-name=crtend.o` csu/crtn.o
/tmp/ccC6lyqC.o: In function `main':
test.c:(.text+0x5): undefined reference to `locs'
test.c:(.text+0xc): undefined reference to `step'
test.c:(.text+0x13): undefined reference to `loc2'
test.c:(.text+0x1c): undefined reference to `advance'
test.c:(.text+0x23): undefined reference to `loc1'
collect2: error: ld returned 1 exit status

---

N.B. I believe there *is* a past-and-future-changes copyright
assignment on file for me for glibc, but it was filed long, long
ago, if I need to do new paperwork that's OK.

zw

---
2015-07-12  Zack Weinberg  <zackw@panix.com>

	* misc/regexp.h: This interface is no longer supported.
	Remove all contents, leaving only an #error directive.
	* misc/regexp.c: Do not include regexp.h.
	(loc1, loc2, locs, step, advance): Demote to compatibility symbols.

 NEWS          |   4 ++
 misc/regexp.c |  37 +++++++----
 misc/regexp.h | 207 ++--------------------------------------------------------
 3 files changed, 34 insertions(+), 214 deletions(-)
  

Comments

Zack Weinberg July 12, 2015, 7:27 p.m. UTC | #1
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA256

On 08/05/2015 10:40 PM, Mike Frysinger wrote:
> On 12 Jul 2015 15:27, Zack Weinberg wrote:
>> I posted this a couple weeks ago, but now that the branch has been
>> cut, `SHLIB_COMPAT(GLIBC_2_0, GLIBC_2_23)` compiles, so I've actually
>> tested it. ;-)  Also, I merged in the corrections to NEWS and the
>> comment at the top of regexp.h, which I posted separately for the
>> 2.22 branch.  And hopefully it will not get mangled this time.
>
> i merged the NEWS/comment correction by itself to keep the
> master/branch in sync, so you'll have to rebase.

OK, rebased patch follows.  Thanks for the quick turnaround.

zw

	* misc/regexp.h: This interface is no longer supported.
	Remove all contents, leaving only an #error directive.
	* misc/regexp.c (loc1, loc2, locs, step, advance):
	Demote to compatibility symbols.
- ---
 NEWS          |   6 +-
 misc/regexp.c |  29 +++++++--
 misc/regexp.h | 203 +---------------------------------------------------------
 3 files changed, 30 insertions(+), 208 deletions(-)

diff --git a/NEWS b/NEWS
index e3c2351..794a515 100644
- --- a/NEWS
+++ b/NEWS
@@ -8,7 +8,11 @@ using `glibc' in the "product" field.
 Version 2.23
 
 * The following bugs are resolved with this release:
- -  16519, 18265, 18525, 18647, 18661.
+
+  16519, 18265, 18525, 18647, 18661, 18681.
+
+* The obsolete header <regexp.h> has been removed.  Programs that require
+  this header must be updated to use <regex.h> instead.
 
 Version 2.22
 
diff --git a/misc/regexp.c b/misc/regexp.c
index ee7d572..ef5e18b 100644
- --- a/misc/regexp.c
+++ b/misc/regexp.c
@@ -1,4 +1,4 @@
- -/* Define function and variables for the obsolete <regexp.h> interface.
+/* Compatibility symbols for the obsolete <regexp.h> interface.
    Copyright (C) 1996-2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
@@ -17,17 +17,27 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
- -/* We don't include regexp.h here because of the macros it requires, and
- -   because it now contains an unconditional #warning.  */
+/* regexp.h now contains only an #error directive, so it cannot be
+   used in this file.
+
+   The function that would produce an 'expbuf' to use as the second
+   argument to 'step' and 'advance' was defined only in regexp.h,
+   as its definition depended on macros defined by the user.  */
 
 #include <regex.h>
+#include <shlib-compat.h>
+
+#if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_23)
 
 /* Define the variables used for the interface.  */
 char *loc1;
 char *loc2;
+compat_symbol (libc, loc1, loc1, GLIBC_2_0);
+compat_symbol (libc, loc2, loc2, GLIBC_2_0);
 
 /* Although we do not support the use we define this variable as well.  */
 char *locs;
+compat_symbol (libc, locs, locs, GLIBC_2_0);
 
 
 /* Find the next match in STRING.  The compiled regular expression is
@@ -35,7 +45,8 @@ char *locs;
    first character matched and `loc2' points to the next unmatched
    character.  */
 int
- -__step (const char *string, const char *expbuf)
+weak_function attribute_compat_text_section
+step (const char *string, const char *expbuf)
 {
   regmatch_t match;	/* We only need info about the full match.  */
 
@@ -50,14 +61,15 @@ __step (const char *string, const char *expbuf)
   loc2 = (char *) string + match.rm_eo;
   return 1;
 }
- -weak_alias (__step, step)
+compat_symbol (libc, step, step, GLIBC_2_0);
 
 
 /* Match the beginning of STRING with the compiled regular expression
    in EXPBUF.  If the match is successful `loc2' will contain the
    position of the first unmatched character.  */
 int
- -__advance (const char *string, const char *expbuf)
+weak_function attribute_compat_text_section
+advance (const char *string, const char *expbuf)
 {
   regmatch_t match;	/* We only need info about the full match.  */
 
@@ -74,4 +86,7 @@ __advance (const char *string, const char *expbuf)
   loc2 = (char *) string + match.rm_eo;
   return 1;
 }
- -weak_alias (__advance, advance)
+compat_symbol (libc, advance, advance, GLIBC_2_0);
+
+
+#endif /* SHLIB_COMPAT (2.0, 2.23) */
diff --git a/misc/regexp.h b/misc/regexp.h
index 42394f7..9f5c413 100644
- --- a/misc/regexp.h
+++ b/misc/regexp.h
@@ -25,206 +25,9 @@
    were encouraged to use <regex.h> instead.  It was officially
    withdrawn from the standard in Issue 6 (aka POSIX.1-2001).
 
- -   This header is provided only for backward compatibility.
- -   It will be removed in the next release of the GNU C Library.
- -   New code should use <regex.h> instead.  */
+   The GNU C Library provided this header through version 2.22. */
 
- -#warning "<regexp.h> will be removed in the next release of the GNU C Library."
- -#warning "Please update your code to use <regex.h> instead (no trailing 'p')."
- -
- -#include <features.h>
- -#include <alloca.h>
- -#include <regex.h>
- -#include <stdlib.h>
- -#include <string.h>
- -
- -/* The implementation provided here emulates the needed functionality
- -   by mapping to the POSIX regular expression matcher.  The interface
- -   for the here included function is weird (this really is a harmless
- -   word).
- -
- -   The user has to provide six macros before this header file can be
- -   included:
- -
- -   INIT		Declarations vor variables which can be used by the
- -		other macros.
- -
- -   GETC()	Return the value of the next character in the regular
- -		expression pattern.  Successive calls should return
- -		successive characters.
- -
- -   PEEKC()	Return the value of the next character in the regular
- -		expression pattern.  Immediately successive calls to
- -		PEEKC() should return the same character which should
- -		also be the next character returned by GETC().
- -
- -   UNGETC(c)	Cause `c' to be returned by the next call to GETC() and
- -		PEEKC().
- -
- -   RETURN(ptr)	Used for normal exit of the `compile' function.  `ptr'
- -		is a pointer to the character after the last character of
- -		the compiled regular expression.
- -
- -   ERROR(val)	Used for abnormal return from `compile'.  `val' is the
- -		error number.  The error codes are:
- -		11	Range endpoint too large.
- -		16	Bad number.
- -		25	\digit out of range.
- -		36	Illegal or missing delimiter.
- -		41	No remembered search string.
- -		42	\( \) imbalance.
- -		43	Too many \(.
- -		44	More tan two numbers given in \{ \}.
- -		45	} expected after \.
- -		46	First number exceeds second in \{ \}.
- -		49	[ ] imbalance.
- -		50	Regular expression overflow.
- -
- -  */
- -
- -__BEGIN_DECLS
- -
- -/* Interface variables.  They contain the results of the successful
- -   calls to `setp' and `advance'.  */
- -extern char *loc1;
- -extern char *loc2;
- -
- -/* The use of this variable in the `advance' function is not
- -   supported.  */
- -extern char *locs;
- -
- -
- -#ifndef __DO_NOT_DEFINE_COMPILE
- -/* Get and compile the user supplied pattern up to end of line or
- -   string or until EOF is seen, whatever happens first.  The result is
- -   placed in the buffer starting at EXPBUF and delimited by ENDBUF.
- -
- -   This function cannot be defined in the libc itself since it depends
- -   on the macros.  */
- -char *
- -compile (char *__restrict instring, char *__restrict expbuf,
- -	 const char *__restrict endbuf, int eof)
- -{
- -  char *__input_buffer = NULL;
- -  size_t __input_size = 0;
- -  size_t __current_size = 0;
- -  int __ch;
- -  int __error;
- -  INIT
- -
- -  /* Align the expression buffer according to the needs for an object
- -     of type `regex_t'.  Then check for minimum size of the buffer for
- -     the compiled regular expression.  */
- -  regex_t *__expr_ptr;
- -# if defined __GNUC__ && __GNUC__ >= 2
- -  const size_t __req = __alignof__ (regex_t *);
- -# else
- -  /* How shall we find out?  We simply guess it and can change it is
- -     this really proofs to be wrong.  */
- -  const size_t __req = 8;
- -# endif
- -  expbuf += __req;
- -  expbuf -= (expbuf - ((char *) 0)) % __req;
- -  if (endbuf < expbuf + sizeof (regex_t))
- -    {
- -      ERROR (50);
- -    }
- -  __expr_ptr = (regex_t *) expbuf;
- -  /* The remaining space in the buffer can be used for the compiled
- -     pattern.  */
- -  __expr_ptr->__REPB_PREFIX (buffer) = expbuf + sizeof (regex_t);
- -  __expr_ptr->__REPB_PREFIX (allocated)
- -    = endbuf - (char *) __expr_ptr->__REPB_PREFIX (buffer);
- -
- -  while ((__ch = (GETC ())) != eof)
- -    {
- -      if (__ch == '\0' || __ch == '\n')
- -	{
- -	  UNGETC (__ch);
- -	  break;
- -	}
- -
- -      if (__current_size + 1 >= __input_size)
- -	{
- -	  size_t __new_size = __input_size ? 2 * __input_size : 128;
- -	  char *__new_room = (char *) alloca (__new_size);
- -	  /* See whether we can use the old buffer.  */
- -	  if (__new_room + __new_size == __input_buffer)
- -	    {
- -	      __input_size += __new_size;
- -	      __input_buffer = (char *) memcpy (__new_room, __input_buffer,
- -					       __current_size);
- -	    }
- -	  else if (__input_buffer + __input_size == __new_room)
- -	    __input_size += __new_size;
- -	  else
- -	    {
- -	      __input_size = __new_size;
- -	      __input_buffer = (char *) memcpy (__new_room, __input_buffer,
- -						__current_size);
- -	    }
- -	}
- -      __input_buffer[__current_size++] = __ch;
- -    }
- -  if (__current_size)
- -    __input_buffer[__current_size++] = '\0';
- -  else
- -    __input_buffer = "";
- -
- -  /* Now compile the pattern.  */
- -  __error = regcomp (__expr_ptr, __input_buffer, REG_NEWLINE);
- -  if (__error != 0)
- -    /* Oh well, we have to translate POSIX error codes.  */
- -    switch (__error)
- -      {
- -      case REG_BADPAT:
- -      case REG_ECOLLATE:
- -      case REG_ECTYPE:
- -      case REG_EESCAPE:
- -      case REG_BADRPT:
- -      case REG_EEND:
- -      case REG_ERPAREN:
- -      default:
- -	/* There is no matching error code.  */
- -	ERROR (36);
- -      case REG_ESUBREG:
- -	ERROR (25);
- -      case REG_EBRACK:
- -	ERROR (49);
- -      case REG_EPAREN:
- -	ERROR (42);
- -      case REG_EBRACE:
- -	ERROR (44);
- -      case REG_BADBR:
- -	ERROR (46);
- -      case REG_ERANGE:
- -	ERROR (11);
- -      case REG_ESPACE:
- -      case REG_ESIZE:
- -	ERROR (50);
- -      }
- -
- -  /* Everything is ok.  */
- -  RETURN ((char *) (__expr_ptr->__REPB_PREFIX (buffer)
- -		    + __expr_ptr->__REPB_PREFIX (used)));
- -}
- -#endif
- -
- -
- -/* Find the next match in STRING.  The compiled regular expression is
- -   found in the buffer starting at EXPBUF.  `loc1' will return the
- -   first character matched and `loc2' points to the next unmatched
- -   character.  */
- -extern int step (const char *__restrict __string,
- -		 const char *__restrict __expbuf) __THROW;
- -
- -/* Match the beginning of STRING with the compiled regular expression
- -   in EXPBUF.  If the match is successful `loc2' will contain the
- -   position of the first unmatched character.  */
- -extern int advance (const char *__restrict __string,
- -		    const char *__restrict __expbuf) __THROW;
- -
- -
- -__END_DECLS
+#error "The GNU C Library no longer implements <regexp.h>."
+#error "Please update your code to use <regex.h> instead (no trailing 'p')."
 
 #endif /* regexp.h */
- -- 
2.5.0

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v2

iQIcBAEBCAAGBQJVxAa6AAoJEJH8wytnaapk+YIQAJOyW9MRAP8PSx+eeNegYZSF
wqfUdtyYefjLmJOQEhk82OcFh1Xwrg1wXnoVJOnOg29iipMa1ebNcAFvDQvGYCW4
ITLrCrf3ISTltxkaJpGkUIU1TSIth2+yDdBca1BeNOqPnr3F9qjNlGSxXJdlvqlx
uByj2DiczNR2Yyy+S7HFxKEU96m0DBYGlS8F75oN7lHaPefWR07uAl/5JeBeahiK
lznYDwxlWyAtUhCiVNE/WA8vB45ib4CA7RRlM/ASOlg+WiXfex/ep3Bz1sw/ciMm
caTr1bVVNqYdKuSscvLa0FalcVqfUXFCTVzt7Oiud/6otDOTr63guc41ZsVrJQ2U
GrzpF9rEz3vY/AWqwSWeShtHiDOW/mTEDYaA18V6aAy4jg0aaW5NWrgSb4p13hcN
8sk9Ku4eP5WTmGv7b7y87QeAlsBxrWdzbJ3P212nZ6yejZCUOWXJNuEmYoByZUiM
gUn3Kwbs7E7v9Jrs2x6Q92ZQUtp4UNsNMBn/XGbdV0C67NFnrh3ZS2uaqW15blYl
jPgiPXD6jULjSe1cJWcpsZiDn9LxhROdhOcsmLAN/wQXRyRJF/1Ol2ZxsZWpqVja
7qZMcg6/UbW8xSYv5vKYMFohgYaILRuLOTtgPpNM6Mf+nU+Dv1qMZhjJX7ZiA+6y
r994Y8TJTopzmVYEHEHF
=fK7E
-----END PGP SIGNATURE-----
  
Andreas Schwab July 12, 2015, 8:46 p.m. UTC | #2
Zack Weinberg <zackw@panix.com> writes:

> The ABI checker does not appear to cover these symbols;

What do you mean with that?

Andreas.
  
Zack Weinberg July 12, 2015, 9:57 p.m. UTC | #3
On Sun, Jul 12, 2015 at 4:46 PM, Andreas Schwab <schwab@linux-m68k.org> wrote:
> Zack Weinberg <zackw@panix.com> writes:
>> The ABI checker does not appear to cover these symbols;
>
> What do you mean with that?

I mean that neither `make check-abi` nor the tester described at
https://sourceware.org/glibc/wiki/Testing/ABI_checker found a problem
with this patch, however, the latter specifically cannot handle
regexp.h (because of the macros) so I was uncertain whether the
relevant symbols had actually been tested.  I see now that they *are*
in the .abilist files, so hopefully that's good enough.

zw
  
Kalle Olavi Niemitalo July 13, 2015, 5:35 a.m. UTC | #4
Zack Weinberg <zackw@panix.com> writes:

> Moreover, the code in regexp.h that uses those macros has been buggy
> since its creation (in 1996) and no one has noticed, which indicates
> to me that there are no users.  (Specifically, RETURN() is used in a
> whole bunch of cases where it should have been ERROR().)

Also, it allocates new memory for the compiled regexp, instead of
using the memory that the caller provided for that purpose.
http://bugs.debian.org/517625
  
Carlos O'Donell July 14, 2015, 9:21 p.m. UTC | #5
On 07/12/2015 03:27 PM, Zack Weinberg wrote:
> <regexp.h> (not to be confused with <regex.h>) is an obsolete and
> frankly horrible regular expression-matching API.  It was part of SVID
> but was withdrawn in Issue 5 (for reference, we're on Issue 7 now).
> It doesn't do anything you can't do with <regex.h>, and using it
> involves defining a bunch of macros before including the header.
> Moreover, the code in regexp.h that uses those macros has been buggy
> since its creation (in 1996) and no one has noticed, which indicates
> to me that there are no users.  (Specifically, RETURN() is used in a
> whole bunch of cases where it should have been ERROR().)
> 
> Therefore, this patch stubs out the header and demotes the
> implementation to compatibility symbols.  I hope this is not too late
> to squeeze into glibc 2.22.  (Note: there isn't any predefined macro
> to put .bss symbols in a compatibility subsection.  Using
> 'attribute_data_compat_section' would make the libc on disk bigger,
> which seems like a move in the wrong direction.  It's only three
> pointers' worth of junk, anyway...)

Too late for 2.22. You posted on the 12th which was technically frozen.
Please keep discussing for 2.23.

> The ABI checker does not appear to cover these symbols; I manually
> tested the effect of the patch as follows:

Interesting. I would have expected the ABI symbol checker to catch these
symbols... but I haven't looked yet.

Do you have any explanation for that? A manual test is not exactly what
we want to see going forward.

> # glibc 2.19 (Debian)
> $ readelf --dyn-syms /lib/x86_64-linux-gnu/libc.so.6 | grep -E 'advance|step|loc[12s]'
>    540: 00000000000e4da0   104 FUNC    WEAK   DEFAULT   12 step@@GLIBC_2.2.5
>   1364: 00000000000e4e10    84 FUNC    WEAK   DEFAULT   12 advance@@GLIBC_2.2.5
>   2051: 00000000003a85e8     8 OBJECT  GLOBAL DEFAULT   32 loc1@@GLIBC_2.2.5
>   2054: 00000000003a85f0     8 OBJECT  GLOBAL DEFAULT   32 loc2@@GLIBC_2.2.5
>   2198: 00000000003a85f8     8 OBJECT  GLOBAL DEFAULT   32 locs@@GLIBC_2.2.5
> 
> # patched libc
> $ readelf --dyn-syms ./libc.so.6 | grep -E 'advance|step|loc[12s]'
>    541: 000000000011eb80   104 FUNC    WEAK   DEFAULT   12 step@GLIBC_2.2.5
>   1374: 000000000011ebf0    84 FUNC    WEAK   DEFAULT   12 advance@GLIBC_2.2.5
>   2065: 00000000003a35e0     8 OBJECT  GLOBAL DEFAULT   32 loc1@GLIBC_2.2.5
>   2068: 00000000003a35e8     8 OBJECT  GLOBAL DEFAULT   32 loc2@GLIBC_2.2.5
>   2211: 00000000003a35d8     8 OBJECT  GLOBAL DEFAULT   32 locs@GLIBC_2.2.5
> 
> $ cat test.c
> #include <inttypes.h>
> 
> extern char *loc1;
> extern char *loc2;
> extern char *locs;
> extern int step();
> extern int advance();
> 
> int main(void)
> {
>   return (int)((intptr_t)step + (intptr_t)advance + (intptr_t)&locs + (intptr_t)&loc1 + (intptr_t)&loc2);
> }
> 
> $ gcc-5 test.c; echo $?
> 0
> $ ./testrun.sh ./a.out; echo $?
> 136
> $ gcc-5 -nostdlib -nostartfiles csu/crt1.o csu/crti.o `gcc-5 --print-file-name=crtbegin.o` test.c libc.so.6 libc_nonshared.a -lgcc `gcc-5 --print-file-name=crtend.o` csu/crtn.o
> /tmp/ccC6lyqC.o: In function `main':
> test.c:(.text+0x5): undefined reference to `locs'
> test.c:(.text+0xc): undefined reference to `step'
> test.c:(.text+0x13): undefined reference to `loc2'
> test.c:(.text+0x1c): undefined reference to `advance'
> test.c:(.text+0x23): undefined reference to `loc1'
> collect2: error: ld returned 1 exit status
> 
> ---
> 
> N.B. I believe there *is* a past-and-future-changes copyright
> assignment on file for me for glibc, but it was filed long, long
> ago, if I need to do new paperwork that's OK.

The problem is that your past-and-futures-changes were with
zack@rabi.phys.columbia.edu, and we don't know what your present
employer might say about the copyright of your work. Therefore
we do need new paperwork either personal ones for you @panix.com,
or corporate coverage.

At a high level your patch looks OK, it makes sense to deprecate
these interfaces, but I think we should to do this in two stages.
Add warnings and then remove.

Not to mention we need a test to make sure the symbols are at the
versions they should be.

Cheers,
Carlos.
  
Zack Weinberg July 15, 2015, 12:20 a.m. UTC | #6
On Tue, Jul 14, 2015 at 5:21 PM, Carlos O'Donell <carlos@redhat.com> wrote:
> On 07/12/2015 03:27 PM, Zack Weinberg wrote:
>> ... this patch stubs out the header and demotes the
>> implementation to compatibility symbols.  I hope this is not too late
>> to squeeze into glibc 2.22.
>
> Too late for 2.22. You posted on the 12th which was technically frozen.
> Please keep discussing for 2.23.

OK. I won't be able to submit a revised patch till after 2.23 opens
due to the use of SHLIB_COMPAT.

>> The ABI checker does not appear to cover these symbols; I manually
>> tested the effect of the patch as follows:
>
> Interesting. I would have expected the ABI symbol checker to catch these
> symbols... but I haven't looked yet.

I was mistaken, the .abilist files *do* cover these symbols.  (The
tool described at
http://ispras.linuxbase.org/index.php/ABI_compliance_checker cannot
process the *header*, which is what confused me.)

>> N.B. I believe there *is* a past-and-future-changes copyright
>> assignment on file for me for glibc, but it was filed long, long
>> ago, if I need to do new paperwork that's OK.
>
> The problem is that your past-and-futures-changes were with
> zack@rabi.phys.columbia.edu, and we don't know what your present
> employer might say about the copyright of your work. Therefore
> we do need new paperwork either personal ones for you @panix.com,
> or corporate coverage.

My current employer is CMU and I believe they make no copyright claim
on work done on my own time with no use of university resources.  I
will get the ball rolling on a new personal copyright assignment.

> At a high level your patch looks OK, it makes sense to deprecate
> these interfaces, but I think we should to do this in two stages.
> Add warnings and then remove.

Hm. If we do that then I would feel obliged to fix the bugs in the
header in phase one. I'm not sure that's worth doing...

zw
  
Mike Frysinger July 15, 2015, 4:01 a.m. UTC | #7
On 12 Jul 2015 15:27, Zack Weinberg wrote:
> Therefore, this patch stubs out the header and demotes the
> implementation to compatibility symbols.

our current workflow says that all user visible changes (like this) require a 
bugzilla bug to be created for reference.  please do that and update the various
pieces with that BZ #.

> +#if SHLIB_COMPAT(libc, GLIBC_2_0, GLIBC_2_22)

should be a space before the (

> -/* Define the variables used for the interface.  */
>  char *loc1;
>  char *loc2;
> -
> -/* Although we do not support the use we define this variable as well.  */
>  char *locs;

should probably leave the comments alone

> +compat_symbol(libc, loc1, loc1, GLIBC_2_0);
> +compat_symbol(libc, loc2, loc2, GLIBC_2_0);
> +compat_symbol(libc, locs, locs, GLIBC_2_0);

should be a space before the (

> +#error "GNU libc no longer implements <regexp.h>."
> +#error "Revise your code to use <regex.h> (no P)."

how about changing the parenthetical aside to:
 ... (no trailing 'p').
-mike
  
Carlos O'Donell July 15, 2015, 1:12 p.m. UTC | #8
On 07/14/2015 08:20 PM, Zack Weinberg wrote:
>> At a high level your patch looks OK, it makes sense to deprecate
>> these interfaces, but I think we should to do this in two stages.
>> Add warnings and then remove.
> 
> Hm. If we do that then I would feel obliged to fix the bugs in the
> header in phase one. I'm not sure that's worth doing...

Worth is certainly in the eye of the beholder. How would you feel if
you were a user of this interface?

c.
  
Zack Weinberg July 15, 2015, 1:14 p.m. UTC | #9
On 07/15/2015 12:01 AM, Mike Frysinger wrote:
> On 12 Jul 2015 15:27, Zack Weinberg wrote:
>> Therefore, this patch stubs out the header and demotes the
>> implementation to compatibility symbols.
> 
> our current workflow says that all user visible changes (like this) require a 
> bugzilla bug to be created for reference.  please do that and update the various
> pieces with that BZ #.

OK, filed https://sourceware.org/bugzilla/show_bug.cgi?id=18681 .

>> +#if SHLIB_COMPAT(libc, GLIBC_2_0, GLIBC_2_22)
> 
> should be a space before the (

Thanks, I haven't coded to the GNU style standards in a long time.

>> -/* Define the variables used for the interface.  */
>>  char *loc1;
>>  char *loc2;
>> -
>> -/* Although we do not support the use we define this variable as well.  */
>>  char *locs;
> 
> should probably leave the comments alone

yeah, ok

>> +#error "GNU libc no longer implements <regexp.h>."
>> +#error "Revise your code to use <regex.h> (no P)."
> 
> how about changing the parenthetical aside to:
>  ... (no trailing 'p').

ok
  
Mike Frysinger Aug. 6, 2015, 2:37 a.m. UTC | #10
On 05 Aug 2015 10:58, Zack Weinberg wrote:
> In the "Kill regexp.h" thread, Joseph dug up more accurate information
> about exactly which editions of the Single Unix Standard included and
> deprecated this header.

pushed, thanks!
-mike
  
Mike Frysinger Aug. 6, 2015, 2:40 a.m. UTC | #11
On 12 Jul 2015 15:27, Zack Weinberg wrote:
> I posted this a couple weeks ago, but now that the branch has been
> cut, `SHLIB_COMPAT(GLIBC_2_0, GLIBC_2_23)` compiles, so I've actually
> tested it. ;-)  Also, I merged in the corrections to NEWS and the
> comment at the top of regexp.h, which I posted separately for the
> 2.22 branch.  And hopefully it will not get mangled this time.

i merged the NEWS/comment correction by itself to keep the master/branch in
sync, so you'll have to rebase.

> I can see two ways to proceed with this patch.  First is just to go
> ahead and land it now, and hope that if it breaks someone, they will
> tell us about it during the 2.23 development cycle.  Second is to
> attempt to search for code it will break, e.g. with a Debian archive
> rebuild or a broad-spectrum code-search service.  I did try the latter
> but was not able to find one that would distinguish *this* <regexp.h>
> from one provided by the application, and there are lots of those.

i'm fine with deleting the header now.
-mike
  
Mike Frysinger Aug. 7, 2015, 2:14 a.m. UTC | #12
i'm ok with this, but i think Carlos wanted to look at it first
-mike
  

Patch

diff --git a/NEWS b/NEWS
index 2cfc43e..c00dba1 100644
--- a/NEWS
+++ b/NEWS
@@ -71,6 +71,10 @@  Version 2.22
   compliance. The new implementation fixes the following long-standing
   issues: BZ#6544, BZ#11216, BZ#12836, BZ#13151, BZ#13152, and BZ#14292. The
   old implementation is still present for use be by existing binaries.
+
+* The obsolete header <regexp.h> (from SUSv2) has been replaced with a
+  stub.  Programs that require this header must be updated to use
+  <regex.h> instead.
 
 Version 2.21
 
diff --git a/misc/regexp.c b/misc/regexp.c
index 3b83203..d35888f 100644
--- a/misc/regexp.c
+++ b/misc/regexp.c
@@ -1,4 +1,4 @@ 
-/* Define function and variables for the obsolete <regexp.h> interface.
+/* Compatibility symbols for the obsolete <regexp.h> interface.
    Copyright (C) 1996-2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
@@ -17,24 +17,32 @@ 
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#define __DO_NOT_DEFINE_COMPILE
-#include <regexp.h>
+/* regexp.h now contains only an #error directive, so it cannot be
+   used in this file.
+
+   The function that would produce an 'expbuf' to use as the second
+   argument to 'step' and 'advance' was defined only in regexp.h,
+   as its definition depended on macros defined by the user.  */
+
+#include <regex.h>
+#include <shlib-compat.h>
+
+#if SHLIB_COMPAT(libc, GLIBC_2_0, GLIBC_2_22)
 
-/* Define the variables used for the interface.  */
 char *loc1;
 char *loc2;
-
-/* Although we do not support the use we define this variable as well.  */
 char *locs;
-
+compat_symbol(libc, loc1, loc1, GLIBC_2_0);
+compat_symbol(libc, loc2, loc2, GLIBC_2_0);
+compat_symbol(libc, locs, locs, GLIBC_2_0);
 
 /* Find the next match in STRING.  The compiled regular expression is
    found in the buffer starting at EXPBUF.  `loc1' will return the
    first character matched and `loc2' points to the next unmatched
    character.  */
-extern int __step (const char *string, const char *expbuf);
 int
-__step (const char *string, const char *expbuf)
+weak_function attribute_compat_text_section
+step (const char *string, const char *expbuf)
 {
   regmatch_t match;	/* We only need info about the full match.  */
 
@@ -49,15 +57,14 @@  __step (const char *string, const char *expbuf)
   loc2 = (char *) string + match.rm_eo;
   return 1;
 }
-weak_alias (__step, step)
-
+compat_symbol (libc, step, step, GLIBC_2_0);
 
 /* Match the beginning of STRING with the compiled regular expression
    in EXPBUF.  If the match is successful `loc2' will contain the
    position of the first unmatched character.  */
-extern int __advance (const char *string, const char *expbuf);
 int
-__advance (const char *string, const char *expbuf)
+weak_function attribute_compat_text_section
+advance (const char *string, const char *expbuf)
 {
   regmatch_t match;	/* We only need info about the full match.  */
 
@@ -74,4 +81,6 @@  __advance (const char *string, const char *expbuf)
   loc2 = (char *) string + match.rm_eo;
   return 1;
 }
-weak_alias (__advance, advance)
+compat_symbol (libc, advance, advance, GLIBC_2_0);
+
+#endif /* SHLIB_COMPAT(2.0, 2.22) */
diff --git a/misc/regexp.h b/misc/regexp.h
index 3fc0bc5..43c6e10 100644
--- a/misc/regexp.h
+++ b/misc/regexp.h
@@ -19,208 +19,15 @@ 
 #ifndef _REGEXP_H
 #define _REGEXP_H	1
 
-/* The contents of this header file was first standardized in X/Open
+/* The contents of this header file were first standardized in X/Open
    System Interface and Headers Issue 2, originally coming from SysV.
-   In issue 4, version 2, it is marked as TO BE WITDRAWN, and it has
-   been withdrawn in SUSv3.
+   In issue 4, version 2, it was marked as TO BE WITHDRAWN, and it was
+   duly withdrawn in issue 5.
 
-   This code shouldn't be used in any newly written code.  It is
-   included only for compatibility reasons.  Use the POSIX definition
-   in <regex.h> for portable applications and a reasonable interface.  */
+   As of GNU libc 2.22, the interfaces in this header have been removed.
+   Use the <regex.h> interfaces instead. */
 
-#include <features.h>
-#include <alloca.h>
-#include <regex.h>
-#include <stdlib.h>
-#include <string.h>
-
-/* The implementation provided here emulates the needed functionality
-   by mapping to the POSIX regular expression matcher.  The interface
-   for the here included function is weird (this really is a harmless
-   word).
-
-   The user has to provide six macros before this header file can be
-   included:
-
-   INIT		Declarations vor variables which can be used by the
-		other macros.
-
-   GETC()	Return the value of the next character in the regular
-		expression pattern.  Successive calls should return
-		successive characters.
-
-   PEEKC()	Return the value of the next character in the regular
-		expression pattern.  Immediately successive calls to
-		PEEKC() should return the same character which should
-		also be the next character returned by GETC().
-
-   UNGETC(c)	Cause `c' to be returned by the next call to GETC() and
-		PEEKC().
-
-   RETURN(ptr)	Used for normal exit of the `compile' function.  `ptr'
-		is a pointer to the character after the last character of
-		the compiled regular expression.
-
-   ERROR(val)	Used for abnormal return from `compile'.  `val' is the
-		error number.  The error codes are:
-		11	Range endpoint too large.
-		16	Bad number.
-		25	\digit out of range.
-		36	Illegal or missing delimiter.
-		41	No remembered search string.
-		42	\( \) imbalance.
-		43	Too many \(.
-		44	More tan two numbers given in \{ \}.
-		45	} expected after \.
-		46	First number exceeds second in \{ \}.
-		49	[ ] imbalance.
-		50	Regular expression overflow.
-
-  */
-
-__BEGIN_DECLS
-
-/* Interface variables.  They contain the results of the successful
-   calls to `setp' and `advance'.  */
-extern char *loc1;
-extern char *loc2;
-
-/* The use of this variable in the `advance' function is not
-   supported.  */
-extern char *locs;
-
-
-#ifndef __DO_NOT_DEFINE_COMPILE
-/* Get and compile the user supplied pattern up to end of line or
-   string or until EOF is seen, whatever happens first.  The result is
-   placed in the buffer starting at EXPBUF and delimited by ENDBUF.
-
-   This function cannot be defined in the libc itself since it depends
-   on the macros.  */
-char *
-compile (char *__restrict instring, char *__restrict expbuf,
-	 const char *__restrict endbuf, int eof)
-{
-  char *__input_buffer = NULL;
-  size_t __input_size = 0;
-  size_t __current_size = 0;
-  int __ch;
-  int __error;
-  INIT
-
-  /* Align the expression buffer according to the needs for an object
-     of type `regex_t'.  Then check for minimum size of the buffer for
-     the compiled regular expression.  */
-  regex_t *__expr_ptr;
-# if defined __GNUC__ && __GNUC__ >= 2
-  const size_t __req = __alignof__ (regex_t *);
-# else
-  /* How shall we find out?  We simply guess it and can change it is
-     this really proofs to be wrong.  */
-  const size_t __req = 8;
-# endif
-  expbuf += __req;
-  expbuf -= (expbuf - ((char *) 0)) % __req;
-  if (endbuf < expbuf + sizeof (regex_t))
-    {
-      ERROR (50);
-    }
-  __expr_ptr = (regex_t *) expbuf;
-  /* The remaining space in the buffer can be used for the compiled
-     pattern.  */
-  __expr_ptr->__REPB_PREFIX (buffer) = expbuf + sizeof (regex_t);
-  __expr_ptr->__REPB_PREFIX (allocated)
-    = endbuf - (char *) __expr_ptr->__REPB_PREFIX (buffer);
-
-  while ((__ch = (GETC ())) != eof)
-    {
-      if (__ch == '\0' || __ch == '\n')
-	{
-	  UNGETC (__ch);
-	  break;
-	}
-
-      if (__current_size + 1 >= __input_size)
-	{
-	  size_t __new_size = __input_size ? 2 * __input_size : 128;
-	  char *__new_room = (char *) alloca (__new_size);
-	  /* See whether we can use the old buffer.  */
-	  if (__new_room + __new_size == __input_buffer)
-	    {
-	      __input_size += __new_size;
-	      __input_buffer = (char *) memcpy (__new_room, __input_buffer,
-					       __current_size);
-	    }
-	  else if (__input_buffer + __input_size == __new_room)
-	    __input_size += __new_size;
-	  else
-	    {
-	      __input_size = __new_size;
-	      __input_buffer = (char *) memcpy (__new_room, __input_buffer,
-						__current_size);
-	    }
-	}
-      __input_buffer[__current_size++] = __ch;
-    }
-  if (__current_size)
-    __input_buffer[__current_size++] = '\0';
-  else
-    __input_buffer = "";
-
-  /* Now compile the pattern.  */
-  __error = regcomp (__expr_ptr, __input_buffer, REG_NEWLINE);
-  if (__error != 0)
-    /* Oh well, we have to translate POSIX error codes.  */
-    switch (__error)
-      {
-      case REG_BADPAT:
-      case REG_ECOLLATE:
-      case REG_ECTYPE:
-      case REG_EESCAPE:
-      case REG_BADRPT:
-      case REG_EEND:
-      case REG_ERPAREN:
-      default:
-	/* There is no matching error code.  */
-	RETURN (36);
-      case REG_ESUBREG:
-	RETURN (25);
-      case REG_EBRACK:
-	RETURN (49);
-      case REG_EPAREN:
-	RETURN (42);
-      case REG_EBRACE:
-	RETURN (44);
-      case REG_BADBR:
-	RETURN (46);
-      case REG_ERANGE:
-	RETURN (11);
-      case REG_ESPACE:
-      case REG_ESIZE:
-	ERROR (50);
-      }
-
-  /* Everything is ok.  */
-  RETURN ((char *) (__expr_ptr->__REPB_PREFIX (buffer)
-		    + __expr_ptr->__REPB_PREFIX (used)));
-}
-#endif
-
-
-/* Find the next match in STRING.  The compiled regular expression is
-   found in the buffer starting at EXPBUF.  `loc1' will return the
-   first character matched and `loc2' points to the next unmatched
-   character.  */
-extern int step (const char *__restrict __string,
-		 const char *__restrict __expbuf) __THROW;
-
-/* Match the beginning of STRING with the compiled regular expression
-   in EXPBUF.  If the match is successful `loc2' will contain the
-   position of the first unmatched character.  */
-extern int advance (const char *__restrict __string,
-		    const char *__restrict __expbuf) __THROW;
-
-
-__END_DECLS
+#error "GNU libc no longer implements <regexp.h>."
+#error "Revise your code to use <regex.h> (no P)."
 
 #endif /* regexp.h */