[RFC,v9,2/6] Implement alternative month names (bug 10871).

Message ID 742475879.1094767.1505817734249@poczta.nazwa.pl
State Superseded
Headers

Commit Message

Rafal Luzynski Sept. 19, 2017, 10:42 a.m. UTC
  Some languages (Slavic, Baltic, etc.) require a genitive case of the
month name when formatting a full date (with the day number) while
they require a nominative case when referring to the month standalone.
This requirement cannot be fulfilled without providing two forms for
each month name.  From now it is precised that nl_langinfo(MON_1)
series (up to MON_12) and strftime("%B") generate the month names in
the grammatical form used when the month forms part of a complete date.
If the grammatical form used when the month is named by itself is needed,
the new values nl_langinfo(ALTMON_1) (up to ALTMON_12) and
strftime("%OB") are supported.  This new feature is optional so the
languages which do not need it or do not yet provide the updated
locales simply do not use it and their behaviour is unchanged.

	[BZ #10871]
	* locale/C-time.c: Add alternative month names, define them as the
	same as mon explicitly.
	* locale/categories.def: alt_mon and wide-alt_mon added.
	* locale/langinfo.h: ALTMON_1 .. ALTMON_12 and similar contants
	defined.
	* locale/programs/ld-time.c: Alternative month names support
	added, they are a copy of mon if not specified explicitly.
	* locale/programs/locfile-kw.gperf: alt_mon defined.
	* locale/programs/locfile-token.h: tok_alt_mon defined.
	* localedata/tst-langinfo.c: Add tests for the new constants
	ALTMON_1 .. ALTMON_12.
	* time/strftime_l.c: %OB format for alternative month names
	added.
	* time/strptime_l.c: Alternative month names also recognized.
---
 ChangeLog                        | 18 +++++++++++++++
 locale/C-time.c                  | 28 ++++++++++++++++++++--
 locale/categories.def            |  2 ++
 locale/langinfo.h                | 50 ++++++++++++++++++++++++++++++++++++++--
 locale/programs/ld-time.c        | 21 +++++++++++++++++
 locale/programs/locfile-kw.gperf |  1 +
 locale/programs/locfile-token.h  |  1 +
 localedata/tst-langinfo.c        | 12 ++++++++++
 time/strftime_l.c                | 11 +++++++--
 time/strptime_l.c                | 24 +++++++++++++++++++
 10 files changed, 162 insertions(+), 6 deletions(-)

 # define HERE_AM_STR (_nl_C_LC_TIME.values[_NL_ITEM_INDEX (AM_STR)].string)
@@ -402,6 +404,20 @@ __strptime_internal (const char *rp, const char *fmt,
struct tm *tmp,
 	      if (s.decided !=raw)
 		{
 		  trp = rp;
+#ifdef _LIBC
+		  /* First check the alt month.  */
+		  if (match_string (_NL_CURRENT (LC_TIME, ALTMON_1 + cnt), trp)
+		      && trp > rp_longest)
+		    {
+		      rp_longest = trp;
+		      cnt_longest = cnt;
+		      if (s.decided == not
+			  && strcmp (_NL_CURRENT (LC_TIME, ALTMON_1 + cnt),
+				     alt_month_name[cnt]))
+			decided_longest = loc;
+		    }
+		  trp = rp;
+#endif
 		  if (match_string (_NL_CURRENT (LC_TIME, MON_1 + cnt), trp)
 		      && trp > rp_longest)
 		    {
@@ -428,6 +444,10 @@ __strptime_internal (const char *rp, const char *fmt,
struct tm *tmp,
 	      if (s.decided != loc
 		  && (((trp = rp, match_string (month_name[cnt], trp))
 		       && trp > rp_longest)
+#ifdef _LIBC
+		      || ((trp = rp, match_string (alt_month_name[cnt], trp))
+			  && trp > rp_longest)
+#endif
 		      || ((trp = rp, match_string (ab_month_name[cnt], trp))
 			  && trp > rp_longest)))
 		{
@@ -1015,6 +1035,10 @@ __strptime_internal (const char *rp, const char *fmt,
struct tm *tmp,
 	case 'O':
 	  switch (*fmt++)
 	    {
+	    case 'B':
+	      /* Undo the increment and continue.  */
+	      fmt--;
+	      break;
 	    case 'd':
 	    case 'e':
 	      /* Match day of month using alternate numeric symbols.  */
  

Comments

Zack Weinberg Oct. 27, 2017, 4:29 p.m. UTC | #1
On Tue, Sep 19, 2017 at 6:42 AM, Rafal Luzynski
<digitalfreak@lingonborough.com> wrote:
> Some languages (Slavic, Baltic, etc.) require a genitive case of the
> month name when formatting a full date (with the day number) while
> they require a nominative case when referring to the month standalone.
> This requirement cannot be fulfilled without providing two forms for
> each month name.  From now it is precised that nl_langinfo(MON_1)
> series (up to MON_12) and strftime("%B") generate the month names in
> the grammatical form used when the month forms part of a complete date.
> If the grammatical form used when the month is named by itself is needed,
> the new values nl_langinfo(ALTMON_1) (up to ALTMON_12) and
> strftime("%OB") are supported.

I am reviewing this patch on the assumption that we have consensus for
the addition of this feature, with the semantics described above, and
without any provision for old binaries (that is, the strings produced
by %B will unconditionally change when locales that need two forms are
updated).  Last call for objections.

The code changes look good to me, except for a few small concerns
listed below.  Also, before landing I would like Joseph Myers (cc:ed)
to positively confirm that this patch introduces no new ISO C or POSIX
conformance issues, and will not tie our hands if this or a similar
feature is standardized in the future.

I do not see any new tests anywhere in this patch series, and I found
a bug in the support for %OB in strptime (see below).  To test this
feature we need at least one locale that actually uses it, so rather
than gate *these* patches on the addition of tests, I am asking you to
write comprehensive tests and include them with the first patch you
submit that adds altmon names to a locale. (This locale will have to
be one of the locales currently listed in localedata/Makefile as
available to tests (the test-input variable, I think); if none of them
need this feature, you may add one.)

>         [BZ #10871]
>         * locale/C-time.c: Add alternative month names, define them as the
>         same as mon explicitly.
>         * locale/categories.def: alt_mon and wide-alt_mon added.
>         * locale/langinfo.h: ALTMON_1 .. ALTMON_12 and similar contants
>         defined.
>         * locale/programs/ld-time.c: Alternative month names support
>         added, they are a copy of mon if not specified explicitly.
>         * locale/programs/locfile-kw.gperf: alt_mon defined.
>         * locale/programs/locfile-token.h: tok_alt_mon defined.
>         * localedata/tst-langinfo.c: Add tests for the new constants
>         ALTMON_1 .. ALTMON_12.
>         * time/strftime_l.c: %OB format for alternative month names
>         added.
>         * time/strptime_l.c: Alternative month names also recognized.

A note on writing ChangeLog entries: we really do want you to list
every single new symbol that's part of the public API, not an
abbreviated list -- that is:

        * locale/langinfo.h [__USE_GNU]: New public symbols ALTMON_1,
        ALTMON_2, ALTMON_3, ALTMON_4, ALTMON_5, ALTMON_6, ALTMON_7,
        ALTMON_8, ALTMON_9, ALTMON_10, ALTMON_11, ALTMON_12,
        _NL_WALTMON_1, _NL_WALTMON_2, _NL_WALTMON_3, _NL_WALTMON_4,
        _NL_WALTMON_5, _NL_WALTMON_6, _NL_WALTMON_7, _NL_WALTMON_8,
        _NL_WALTMON_9, _NL_WALTMON_10, _NL_WALTMON_11, _NL_WALTMON_12.

We want it spelled out like this because the ChangeLog exists to be
grepped.  The idea is that someone five years from now who is having a
weird problem involving _NL_WALTMON_9 should be able to search for
_NL_WALTMON_9 and find the commit where it was introduced.  If you
abbreviate the list, that won't work.

The official instructions for writing ChangeLogs
<https://www.gnu.org/prep/standards/html_node/Change-Logs.html> would
have you mention *every* new, changed, or deleted symbol for *every*
file where it appears, but I find that that bulks up the log entries
without adding much benefit.  It is enough, I think, to mention every
file, and (independently) every new, changed, or deleted symbol at
least once.  That's enough for the hypothetical future person to find
the commit, and then they're going to look at the diffs.

> diff --git a/ChangeLog b/ChangeLog
> index 3b8e6c5..b0636ff 100644
> --- a/ChangeLog
> +++ b/ChangeLog

Another minor procedural thing: don't include diffs to the file
ChangeLog in commits sent for review, because if a reviewer wants to
apply the patch and tinker with it, they don't want to have to clean
up the inevitable merge botch in the ChangeLog.

> @@ -402,6 +404,20 @@ __strptime_internal (const char *rp, const char *fmt,
> struct tm *tmp,
>               if (s.decided !=raw)
>                 {
>                   trp = rp;
> +#ifdef _LIBC
> +                 /* First check the alt month.  */

Here you are checking the alt-month name before the month name ...

> @@ -428,6 +444,10 @@ __strptime_internal (const char *rp, const char *fmt,
> struct tm *tmp,
>               if (s.decided != loc
>                   && (((trp = rp, match_string (month_name[cnt], trp))
>                        && trp > rp_longest)
> +#ifdef _LIBC
> +                     || ((trp = rp, match_string (alt_month_name[cnt], trp))
> +                         && trp > rp_longest)
> +#endif
>                       || ((trp = rp, match_string (ab_month_name[cnt], trp))
>                           && trp > rp_longest)))

... and here you are checking the alt-month name after the month name.
Please make this consistent one way or the other.  (I *think* the
order of checks would only ever matter if the alt-month name for month
X were the same as the month name for month Y, which would be a nasty
ambiguity in the relevant natural language -- but nasty ambiguities do
actually happen in natural languages, so we need to be careful.)

I wonder whether we actually need the #ifdef _LIBC here -- but that's
an independent question (is this code _really_ still shared with
gnulib, or have they diverged to the point where we should stop trying
to keep the files in sync?) and you are not on the hook to answer it.

> @@ -1015,6 +1035,10 @@ __strptime_internal (const char *rp, const char *fmt,
> struct tm *tmp,
>         case 'O':
>           switch (*fmt++)
>             {
> +           case 'B':
> +             /* Undo the increment and continue.  */
> +             fmt--;
> +             break;

This is subtly wrong, and I had to read a big chunk of the entire
function to understand what it was meant to do and how it doesn't
actually work.  Please change to

]         case 'O':
]           switch (*fmt++)
]             {
] +           case 'B':
] +             /* Match month name.  Reprocess as plain 'B'.  */
] +             fmt--;
] +             goto start_over;

You will also need to remove the `#ifndef _NL_CURRENT` wrapping the
definition of the `start_over` label, and change its comment:

] -#ifndef _NL_CURRENT
] -      /* We need this for handling the `E' modifier.  */
] +      /* In some cases, modifiers are handled by adjusting state and
] +         then restarting the switch statement below.  */
]     start_over:
] -#endif
]
]       /* Make back up of current processing pointer.  */

(Am I seriously telling you to use `goto`?  Yes, I am.  This is the
kind of abnormal control flow situation where a judicious `goto` is
_easier_ to read than the alternative - what you had was meant to do
the same thing, but it did it by cycling the `while (*fmt != '\0')`
loop, which meant that all of the processing _above_ the outer switch
statement was repeated, which was both wrong (note the unconditional
`++fmt` above "We discard strftime modifiers") and more fragile and
complicated to understand than "goto start_over", which requires the
reader only to search for the "start_over" label.)

zw
  
Joseph Myers Oct. 27, 2017, 5:03 p.m. UTC | #2
On Fri, 27 Oct 2017, Zack Weinberg wrote:

> The code changes look good to me, except for a few small concerns
> listed below.  Also, before landing I would like Joseph Myers (cc:ed)
> to positively confirm that this patch introduces no new ISO C or POSIX
> conformance issues, and will not tie our hands if this or a similar
> feature is standardized in the future.

I believe this patch properly handles namespace issues and avoids any 
problems with improperly changing enum values or making them improperly 
depend on feature test macros defined.  I believe it properly deals with 
allowing existing locales using existing POSIX features to continue to 
work by copying the month values as needed.

There should be a GCC bug filed, if there isn't one already, for allowing 
%OB / %Ob in strftime formats (with appropriate warnings in ISO C pedantic 
modes about being an extension).  Tests may need to disable errors for 
format warnings when built with a GCC version without that support.

> > @@ -1015,6 +1035,10 @@ __strptime_internal (const char *rp, const char *fmt,
> > struct tm *tmp,
> >         case 'O':
> >           switch (*fmt++)
> >             {
> > +           case 'B':
> > +             /* Undo the increment and continue.  */
> > +             fmt--;
> > +             break;
> 
> This is subtly wrong, and I had to read a big chunk of the entire
> function to understand what it was meant to do and how it doesn't
> actually work.  Please change to
> 
> ]         case 'O':
> ]           switch (*fmt++)
> ]             {
> ] +           case 'B':
> ] +             /* Match month name.  Reprocess as plain 'B'.  */
> ] +             fmt--;
> ] +             goto start_over;

Also, the POSIX proposal accepted for issue 8 has strptime %Ob and 
strptime %OB handled the same, despite that POSIX proposal not including 
strftime %Ob.  (This is only an argument for %Ob handling for strptime to 
go in this patch rather than the later one adding alternative abbreviated 
months, however.)
  
Zack Weinberg Oct. 27, 2017, 5:09 p.m. UTC | #3
On Fri, Oct 27, 2017 at 1:03 PM, Joseph Myers <joseph@codesourcery.com> wrote:
> On Fri, 27 Oct 2017, Zack Weinberg wrote:
>
>> The code changes look good to me, except for a few small concerns
>> listed below.  Also, before landing I would like Joseph Myers (cc:ed)
>> to positively confirm that this patch introduces no new ISO C or POSIX
>> conformance issues, and will not tie our hands if this or a similar
>> feature is standardized in the future.
>
> I believe this patch properly handles namespace issues and avoids any
> problems with improperly changing enum values or making them improperly
> depend on feature test macros defined.  I believe it properly deals with
> allowing existing locales using existing POSIX features to continue to
> work by copying the month values as needed.

Thanks.

> There should be a GCC bug filed, if there isn't one already, for allowing
> %OB / %Ob in strftime formats (with appropriate warnings in ISO C pedantic
> modes about being an extension).  Tests may need to disable errors for
> format warnings when built with a GCC version without that support.

I don't see any such bug in GCC bugzilla.  Would you mind filing one?

> Also, the POSIX proposal accepted for issue 8 has strptime %Ob and
> strptime %OB handled the same, despite that POSIX proposal not including
> strftime %Ob.  (This is only an argument for %Ob handling for strptime to
> go in this patch rather than the later one adding alternative abbreviated
> months, however.)

Since the entire patch series should be pushed at once anyway, let's
not make additional work for Rafal.

Could you point me at the accepted POSIX proposal, please?

zw
  
Joseph Myers Oct. 27, 2017, 5:19 p.m. UTC | #4
On Fri, 27 Oct 2017, Zack Weinberg wrote:

> > There should be a GCC bug filed, if there isn't one already, for allowing
> > %OB / %Ob in strftime formats (with appropriate warnings in ISO C pedantic
> > modes about being an extension).  Tests may need to disable errors for
> > format warnings when built with a GCC version without that support.
> 
> I don't see any such bug in GCC bugzilla.  Would you mind filing one?

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82752 filed.

> > Also, the POSIX proposal accepted for issue 8 has strptime %Ob and
> > strptime %OB handled the same, despite that POSIX proposal not including
> > strftime %Ob.  (This is only an argument for %Ob handling for strptime to
> > go in this patch rather than the later one adding alternative abbreviated
> > months, however.)
> 
> Since the entire patch series should be pushed at once anyway, let's
> not make additional work for Rafal.
> 
> Could you point me at the accepted POSIX proposal, please?

http://austingroupbugs.net/view.php?id=258 (issue8 tag = for next major 
revision).
  
Zack Weinberg Oct. 27, 2017, 5:51 p.m. UTC | #5
On Fri, Oct 27, 2017 at 1:19 PM, Joseph Myers <joseph@codesourcery.com> wrote:
>
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82752 filed.

Thanks.  I added a note about strptime.

> http://austingroupbugs.net/view.php?id=258 (issue8 tag = for next major
> revision).

Hmm.  Should we, somehow, tell them about the need for %Ob?

zw
  
Joseph Myers Oct. 27, 2017, 7:25 p.m. UTC | #6
On Fri, 27 Oct 2017, Zack Weinberg wrote:

> On Fri, Oct 27, 2017 at 1:19 PM, Joseph Myers <joseph@codesourcery.com> wrote:
> >
> > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82752 filed.
> 
> Thanks.  I added a note about strptime.

GCC doesn't have strptime format checking.

> > http://austingroupbugs.net/view.php?id=258 (issue8 tag = for next major
> > revision).
> 
> Hmm.  Should we, somehow, tell them about the need for %Ob?

Yes, probably.  I believe anyone can create an account to file or comment 
on bugs.
  
Zack Weinberg Oct. 27, 2017, 8:16 p.m. UTC | #7
On Fri, Oct 27, 2017 at 3:25 PM, Joseph Myers <joseph@codesourcery.com> wrote:
> On Fri, 27 Oct 2017, Zack Weinberg wrote:
>>> On Fri, Oct 27, 2017 at 1:19 PM, Joseph Myers <joseph@codesourcery.com> wrote:
>> > http://austingroupbugs.net/view.php?id=258 (issue8 tag = for next major
>> > revision).
>>
>> Hmm.  Should we, somehow, tell them about the need for %Ob?
>
> Yes, probably.  I believe anyone can create an account to file or comment
> on bugs.

I have filed http://austingroupbugs.net/view.php?id=1166 .

zw
  
Joseph Myers Oct. 27, 2017, 8:30 p.m. UTC | #8
On Fri, 27 Oct 2017, Zack Weinberg wrote:

> On Fri, Oct 27, 2017 at 3:25 PM, Joseph Myers <joseph@codesourcery.com> wrote:
> > On Fri, 27 Oct 2017, Zack Weinberg wrote:
> >>> On Fri, Oct 27, 2017 at 1:19 PM, Joseph Myers <joseph@codesourcery.com> wrote:
> >> > http://austingroupbugs.net/view.php?id=258 (issue8 tag = for next major
> >> > revision).
> >>
> >> Hmm.  Should we, somehow, tell them about the need for %Ob?
> >
> > Yes, probably.  I believe anyone can create an account to file or comment
> > on bugs.
> 
> I have filed http://austingroupbugs.net/view.php?id=1166 .

I note that includes ABALTMON_* constants in langinfo.h, whereas the 
present patch series has ALTMON_* constants but no ABALTMON_* names.
  
Zack Weinberg Oct. 27, 2017, 8:36 p.m. UTC | #9
On Fri, Oct 27, 2017 at 4:30 PM, Joseph Myers <joseph@codesourcery.com> wrote:
> On Fri, 27 Oct 2017, Zack Weinberg wrote:
>
>> On Fri, Oct 27, 2017 at 3:25 PM, Joseph Myers <joseph@codesourcery.com> wrote:
>> > On Fri, 27 Oct 2017, Zack Weinberg wrote:
>> >>> On Fri, Oct 27, 2017 at 1:19 PM, Joseph Myers <joseph@codesourcery.com> wrote:
>> >> > http://austingroupbugs.net/view.php?id=258 (issue8 tag = for next major
>> >> > revision).
>> >>
>> >> Hmm.  Should we, somehow, tell them about the need for %Ob?
>> >
>> > Yes, probably.  I believe anyone can create an account to file or comment
>> > on bugs.
>>
>> I have filed http://austingroupbugs.net/view.php?id=1166 .
>
> I note that includes ABALTMON_* constants in langinfo.h, whereas the
> present patch series has ALTMON_* constants but no ABALTMON_* names.

Rafal's patches have _NL_ABALTMON_*.  I am not clear on when we
actually want _NL_ prefixes on those constants, but I figured it was
simpler to write the proposal with no prefixes and the Austin Group
can add them if they want them.

zw
  
Florian Weimer Oct. 30, 2017, 11:21 a.m. UTC | #10
On 10/27/2017 06:29 PM, Zack Weinberg wrote:
> I am reviewing this patch on the assumption that we have consensus for
> the addition of this feature, with the semantics described above, and
> without any provision for old binaries (that is, the strings produced
> by %B will unconditionally change when locales that need two forms are
> updated).  Last call for objections.

I still think the approach of redefining the meaning of %B is wrong and 
will do more harm than good, but it's not a sustained objection.

(This is not a matter of supporting old binaries, I think the compat 
symbol approach would be even worse.)

Thanks,
Florian
  
Rafal Luzynski Nov. 6, 2017, 10:36 p.m. UTC | #11
27.10.2017 18:29 Zack Weinberg <zackw@panix.com> wrote:
> [...]
> I do not see any new tests anywhere in this patch series, and I found
> a bug in the support for %OB in strptime (see below). To test this
> feature we need at least one locale that actually uses it, so rather
> than gate *these* patches on the addition of tests, I am asking you to
> write comprehensive tests and include them with the first patch you
> submit that adds altmon names to a locale. (This locale will have to
> be one of the locales currently listed in localedata/Makefile as
> available to tests (the test-input variable, I think); if none of them
> need this feature, you may add one.)

I've browsed some test and I've found that a similar test already
exists: time/tst-strptime.c. [1] All we need is to add some more tests
for some more locales.  Of course we need to add more LOCALES to the
corresponding Makefile because now it contains only German, US English,
and Japanese.  I think I should add more than one locale.  Please confirm
that I'm thinking correctly.

Actually we don't need the updated locale data containing the actual
alternative month names.  In most cases the %B/%OB format specifiers
should work correctly in strptime() no matter if the input string contains
the nominative or genitive case because the algorithm searches for the
best match rather than for the equal string.

A longer explanation: The algorithm selects the month name which has
the longest matching initial substring with the input string, including
the terminating '\0' character.  Since in eastern European languages the
grammatical cases are made by appending or changing suffixes while the
stems (usually) remain the same, this algorithm is still able to
recognize the month name correctly.  Though, there are cases where
this will not work:

* Romance languages using prepositions e.g., Catalan "de novembre"
  and "d’octubre" will not be recognized correctly; their best match
  without the preposition will be always "desembre" (December)
  which is incorrect.
* Russian and Belarusian "мая" (of May) will match both "май" (May)
  and "март" (March), therefore it will be ambiguous and (if I think
  correctly) it will be recognized as March which is incorrect.

Of course it's worth to add such tests once the genitive month names
are actually added.

>
> [...]
> A note on writing ChangeLog entries: we really do want you to list
> every single new symbol that's part of the public API, not an
> abbreviated list -- that is:
>
> * locale/langinfo.h [__USE_GNU]: New public symbols ALTMON_1,
> ALTMON_2, ALTMON_3, ALTMON_4, ALTMON_5, ALTMON_6, ALTMON_7,
> ALTMON_8, ALTMON_9, ALTMON_10, ALTMON_11, ALTMON_12,
> _NL_WALTMON_1, _NL_WALTMON_2, _NL_WALTMON_3, _NL_WALTMON_4,
> _NL_WALTMON_5, _NL_WALTMON_6, _NL_WALTMON_7, _NL_WALTMON_8,
> _NL_WALTMON_9, _NL_WALTMON_10, _NL_WALTMON_11, _NL_WALTMON_12.

Applied locally.  Although I have some doubts.  Only ALTMON_x
symbols are defined conditionally (#ifdef __USE_GNU) while
_NL_WALTMON_x are defined unconditionally.  Additionally, there
are __ALTMON_x symbols, also unconditional, defined only because
ALTMON_x are not (yet) defined by POSIX. [2] [3] Should the
ChangeLog be more detailed?

> [ cut longer explanation about ChangeLog ]

Another longer explanation which I find worth copying to wiki,
again if you don't mind, Zack.

> [...]
> Another minor procedural thing: don't include diffs to the file
> ChangeLog in commits sent for review, because if a reviewer wants to
> apply the patch and tinker with it, they don't want to have to clean
> up the inevitable merge botch in the ChangeLog.

OK, again worth copying to wiki.

> > @@ -402,6 +404,20 @@ __strptime_internal (const char *rp, const char *fmt,
> > struct tm *tmp,
> > if (s.decided !=raw)
> > {
> > trp = rp;
> > +#ifdef _LIBC
> > + /* First check the alt month. */
>
> Here you are checking the alt-month name before the month name ...
>
> > @@ -428,6 +444,10 @@ __strptime_internal (const char *rp, const char *fmt,
> > struct tm *tmp,
> > if (s.decided != loc
> > && (((trp = rp, match_string (month_name[cnt], trp))
> > && trp > rp_longest)
> > +#ifdef _LIBC
> > + || ((trp = rp, match_string (alt_month_name[cnt], trp))
> > + && trp > rp_longest)
> > +#endif
> > || ((trp = rp, match_string (ab_month_name[cnt], trp))
> > && trp > rp_longest)))
>
> ... and here you are checking the alt-month name after the month name.
> Please make this consistent one way or the other. (I *think* the
> order of checks would only ever matter if the alt-month name for month
> X were the same as the month name for month Y, which would be a nasty
> ambiguity in the relevant natural language -- but nasty ambiguities do
> actually happen in natural languages, so we need to be careful.)

Indeed, it should not matter because the algorithm searches for the
best matching string which should be only one.  If there are two best
matching strings then the input string is ambiguous and cannot be
determined, probably the reason is that the input string is wrong.
If a language has a string which means two different months depending
on the context then the native speakers have a trouble understanding it.
That means, it's unlikely that a language has such a feature.  But
I agree, "unlikely" is not "impossible".  Please note that so far
I've identified only about 20 languages (out of about 200 supported
by glibc) which need the nominative/genitive difference in month
names, they are in my github repo. [4] It is possible to verify them
manually and I think that I did it in the past although I'm not
sure now.

But I will apply your remark and introduce the consistency to make
the code more readable.  So far I was focused on making the *patches*
more readable.

Note that the question "what should be matched first: the basic or
the regular month name?" does not have a good answer.  The real good
answer should be "alternative first if the O modifier is present,
basic first otherwise" but since the algorithm drops the modifier
and treats all specifiers: B, b, and h identically it is difficult
to tell.  Is it worth to store this information and provide different
paths for the specifiers with the O modifier and without it?  I don't
think so.  Please, note above, most probably that case does not
actually exist.

While at this, I'm somehow concerned by the waste of the CPU cycles
to check both basic and alternative month names while in 90% languages
these strings will always be the same.  But I think that checking
whether the current locale actually requires the alternative month
names and providing separate paths for them would be only worse.

> I wonder whether we actually need the #ifdef _LIBC here -- but that's
> an independent question (is this code _really_ still shared with
> gnulib, or have they diverged to the point where we should stop trying
> to keep the files in sync?) and you are not on the hook to answer it.

OK, I'm unable to answer this reliably but I think that gnulib should
follow this change as well.  The reason is that the command line utility
date uses gnulib rather than glibc.  So if we implement the change in
glibc but do not implement it in other libraries and programs the change
will be incomplete.

> > @@ -1015,6 +1035,10 @@ __strptime_internal (const char *rp, const char *fmt,
> > struct tm *tmp,
> > case 'O':
> > switch (*fmt++)
> > {
> > + case 'B':
> > + /* Undo the increment and continue. */
> > + fmt--;
> > + break;
>
> This is subtly wrong, and I had to read a big chunk of the entire
> function to understand what it was meant to do and how it doesn't
> actually work.

Oops... :-(

Thank you for spotting, analyzing and fixing this.

> Please change to
>
> ] case 'O':
> ] switch (*fmt++)
> ] {
> ] + case 'B':
> ] + /* Match month name. Reprocess as plain 'B'. */
> ] + fmt--;
> ] + goto start_over;
>
> You will also need to remove the `#ifndef _NL_CURRENT` wrapping the
> definition of the `start_over` label, and change its comment:
>
> ] -#ifndef _NL_CURRENT
> ] - /* We need this for handling the `E' modifier. */
> ] + /* In some cases, modifiers are handled by adjusting state and
> ] + then restarting the switch statement below. */
> ] start_over:
> ] -#endif
> ]
> ] /* Make back up of current processing pointer. */

I will analyze it more thoroughly but at the first sight your fixes
are correct and do not require any further changes.

> (Am I seriously telling you to use `goto`? Yes, I am. [...]

I'm OK with goto if there are good reasons to use it, especially
in the core libraries.  Most of the time there are better solutions
but this time your reasons are good.

Thank you again, regards,

Rafal


[1] https://sourceware.org/git/?p=glibc.git;a=blob;f=time/tst-strptime.c;hb=HEAD
[2] https://sourceware.org/ml/libc-alpha/2016-10/msg00303.html
[3] https://sourceware.org/ml/libc-alpha/2016-12/msg01065.html
[4] https://github.com/rluzynski/glibc
  
Rafal Luzynski Nov. 6, 2017, 10:52 p.m. UTC | #12
27.10.2017 19:03 Joseph Myers <joseph@codesourcery.com> wrote:
> [...]
> Also, the POSIX proposal accepted for issue 8 has strptime %Ob and
> strptime %OB handled the same, despite that POSIX proposal not including
> strftime %Ob. (This is only an argument for %Ob handling for strptime to
> go in this patch rather than the later one adding alternative abbreviated
> months, however.)
>
> --
> Joseph S. Myers
> joseph@codesourcery.com

This may be an omission at POSIX side but I also think that the good
reason why all variants (B, b, h, OB, Ob, Oh) should be treated the same
is that the strptime() algorithm should be more forgiving and should
not throw the errors like "yes, I recognize correctly that Nov is
a shortcut for November but the format specifier says you were supposed
to provide a full version therefore I report an error here" (also swap
full/abbreviated, replace with basic/alternative etc.)

Regards,

Rafal
  
Rafal Luzynski Nov. 6, 2017, 11:01 p.m. UTC | #13
27.10.2017 22:16 Zack Weinberg <zackw@panix.com> wrote:
>
>
> On Fri, Oct 27, 2017 at 3:25 PM, Joseph Myers <joseph@codesourcery.com> wrote:
> > On Fri, 27 Oct 2017, Zack Weinberg wrote:
> >>> On Fri, Oct 27, 2017 at 1:19 PM, Joseph Myers <joseph@codesourcery.com>
> >>> wrote:
> >> > http://austingroupbugs.net/view.php?id=258 (issue8 tag = for next major
> >> > revision).
> >>
> >> Hmm. Should we, somehow, tell them about the need for %Ob?
> >
> > Yes, probably. I believe anyone can create an account to file or comment
> > on bugs.
>
> I have filed http://austingroupbugs.net/view.php?id=1166 .

Thank you Zack for filing this, also thank you Joseph for filing an
issue against GCC.  Your reports are perfect and prove that you understand
the problem.

My only concern is that both reports in austingroupbugs still refer to
the actual grammatical cases (genitive and nominative) which already lead
to some misunderstandings here in this list.  It has been stated that
it's more correct to refer to "the form correct when the month is used
to format a full date" and "the form correct when the month is named
by itself".  In some languages the genitive case exists and it's incorrect
to use it when formatting a full date.

Regards,

Rafal
  
Rafal Luzynski Nov. 6, 2017, 11:08 p.m. UTC | #14
27.10.2017 22:36 Zack Weinberg <zackw@panix.com> wrote:
> On Fri, Oct 27, 2017 at 4:30 PM, Joseph Myers <joseph@codesourcery.com> wrote:
> > On Fri, 27 Oct 2017, Zack Weinberg wrote:
> >> [...]
> >> I have filed http://austingroupbugs.net/view.php?id=1166 .
> >
> > I note that includes ABALTMON_* constants in langinfo.h, whereas the
> > present patch series has ALTMON_* constants but no ABALTMON_* names.
>
> Rafal's patches have _NL_ABALTMON_*. I am not clear on when we
> actually want _NL_ prefixes on those constants, but I figured it was
> simpler to write the proposal with no prefixes and the Austin Group
> can add them if they want them.

If I understand correctly, _NL_* prefixes are used for our own extensions
while the constants without the prefixes are standardized by POSIX.

I can't define ABALTMON_* constants before POSIX accepts and publishes
the change.  Should I define them now when the issue is filed?  Can
I assume that if the change is filed it will be eventually accepted
just because it looks reasonable for us?  I don't think so.  It may take
multiple years.  How should we prepare for the potential change in the
distant future?  Is it OK to leave it as it is now and assume that
ABALTMON_* symbols will be defined (as aliases) when they are needed?

Regards,

Rafal
  
Rafal Luzynski Nov. 15, 2017, 10:32 a.m. UTC | #15
6.11.2017 23:36 Rafal Luzynski <digitalfreak@lingonborough.com> wrote:
> [...]
> Actually we don't need the updated locale data containing the actual
> alternative month names. In most cases the %B/%OB format specifiers
> should work correctly in strptime() no matter if the input string contains
> the nominative or genitive case because the algorithm searches for the
> best match rather than for the equal string.
>
> A longer explanation: The algorithm selects the month name which has
> the longest matching initial substring with the input string, including
> the terminating '\0' character. Since in eastern European languages the
> grammatical cases are made by appending or changing suffixes while the
> stems (usually) remain the same, this algorithm is still able to
> recognize the month name correctly. [...]

I made some tests locally and it turns out I was wrong here.  The
genitive forms (in eastern European languages) will not be recognized
by the current locales just because they look similar to the nominative
forms.  For example, in Polish language (I choose the simplest word)
the word for May is "maj", the genitive form is "maja".  When we try
to parse "maja" with strptime("%B", ...) we get the substring "maj"
correctly recognized as the 5th month while the letter "a" remains
unparsed which eventually raises an error.  I incorrectly thought
that strptime() matches the whole word and returns the index of the
word from the repository (from nl_langinfo() results) which has the
longest matching substring (whole string is the best).  Actually
it matches the longest substring and leaves the rest of the word
unparsed.

Shortly, only the words which are actually present in the repository
are recognized as the month names, or their initial substrings.
The words which partially match raise errors.

On the other hand, "%OB" format specifier should work fine
(as well as "%B") as long as we use only the nominative cases
in the input strings.

Regards,

Rafal
  

Patch

diff --git a/ChangeLog b/ChangeLog
index 3b8e6c5..b0636ff 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,23 @@ 
 2017-09-19  Rafal Luzynski  <digitalfreak@lingonborough.com>
 
+	[BZ #10871]
+	* locale/C-time.c: Add alternative month names, define them as the
+	same as mon explicitly.
+	* locale/categories.def: alt_mon and wide-alt_mon added.
+	* locale/langinfo.h: ALTMON_1 .. ALTMON_12 and similar contants
+	defined.
+	* locale/programs/ld-time.c: Alternative month names support
+	added, they are a copy of mon if not specified explicitly.
+	* locale/programs/locfile-kw.gperf: alt_mon defined.
+	* locale/programs/locfile-token.h: tok_alt_mon defined.
+	* localedata/tst-langinfo.c: Add tests for the new constants
+	ALTMON_1 .. ALTMON_12.
+	* time/strftime_l.c: %OB format for alternative month names
+	added.
+	* time/strptime_l.c: Alternative month names also recognized.
+
+2017-09-19  Rafal Luzynski  <digitalfreak@lingonborough.com>
+
 	* locale/loadlocale.c: Correct size of
 	_nl_value_type_LC_<category> arrays.
 
diff --git a/locale/C-time.c b/locale/C-time.c
index 31d8704..ee33652 100644
--- a/locale/C-time.c
+++ b/locale/C-time.c
@@ -30,7 +30,7 @@  const struct __locale_data _nl_C_LC_TIME attribute_hidden =
   { NULL, },			/* no cached data */
   UNDELETABLE,
   0,
-  111,
+  135,
   {
     { .string = "Sun" },
     { .string = "Mon" },
@@ -142,6 +142,30 @@  const struct __locale_data _nl_C_LC_TIME attribute_hidden =
     { .string = "" },
     { .string = "%a %b %e %H:%M:%S %Z %Y" },
     { .wstr = (const uint32_t *) L"%a %b %e %H:%M:%S %Z %Y" },
-    { .string = _nl_C_codeset }
+    { .string = _nl_C_codeset },
+    { .string = "January" },
+    { .string = "February" },
+    { .string = "March" },
+    { .string = "April" },
+    { .string = "May" },
+    { .string = "June" },
+    { .string = "July" },
+    { .string = "August" },
+    { .string = "September" },
+    { .string = "October" },
+    { .string = "November" },
+    { .string = "December" },
+    { .wstr = (const uint32_t *) L"January" },
+    { .wstr = (const uint32_t *) L"February" },
+    { .wstr = (const uint32_t *) L"March" },
+    { .wstr = (const uint32_t *) L"April" },
+    { .wstr = (const uint32_t *) L"May" },
+    { .wstr = (const uint32_t *) L"June" },
+    { .wstr = (const uint32_t *) L"July" },
+    { .wstr = (const uint32_t *) L"August" },
+    { .wstr = (const uint32_t *) L"September" },
+    { .wstr = (const uint32_t *) L"October" },
+    { .wstr = (const uint32_t *) L"November" },
+    { .wstr = (const uint32_t *) L"December" }
   }
 };
diff --git a/locale/categories.def b/locale/categories.def
index 27a6129..53ec8c5 100644
--- a/locale/categories.def
+++ b/locale/categories.def
@@ -249,6 +249,8 @@  DEFINE_CATEGORY
   DEFINE_ELEMENT (_DATE_FMT,                "date_fmt",            opt, string)
   DEFINE_ELEMENT (_NL_W_DATE_FMT,           "wide-date_fmt",       opt,
wstring)
   DEFINE_ELEMENT (_NL_TIME_CODESET,	    "time-codeset",	   std, string)
+  DEFINE_ELEMENT (ALTMON_1,       "alt_mon",       opt, stringarray, 12, 12)
+  DEFINE_ELEMENT (_NL_WALTMON_1,  "wide-alt_mon",  opt, wstringarray, 12, 12)
   ), NO_POSTLOAD)
 
 
diff --git a/locale/langinfo.h b/locale/langinfo.h
index 1403957..78103ce 100644
--- a/locale/langinfo.h
+++ b/locale/langinfo.h
@@ -100,7 +100,8 @@  enum
   ABMON_12,
 #define ABMON_12		ABMON_12
 
-  /* Long month names.  */
+  /* Long month names, in the grammatical form used when the month
+     forms part of a complete date.  */
   MON_1,			/* January */
 #define MON_1			MON_1
   MON_2,
@@ -189,7 +190,8 @@  enum
   _NL_WABMON_11,
   _NL_WABMON_12,
 
-  /* Long month names.  */
+  /* Long month names, in the grammatical form used when the month
+     forms part of a complete date.  */
   _NL_WMON_1,		/* January */
   _NL_WMON_2,
   _NL_WMON_3,
@@ -231,6 +233,50 @@  enum
 
   _NL_TIME_CODESET,
 
+  /* Long month names, in the grammatical form used when the month
+     is named by itself.  */
+  __ALTMON_1,			/* January */
+  __ALTMON_2,
+  __ALTMON_3,
+  __ALTMON_4,
+  __ALTMON_5,
+  __ALTMON_6,
+  __ALTMON_7,
+  __ALTMON_8,
+  __ALTMON_9,
+  __ALTMON_10,
+  __ALTMON_11,
+  __ALTMON_12,
+#ifdef __USE_GNU
+# define ALTMON_1		__ALTMON_1
+# define ALTMON_2		__ALTMON_2
+# define ALTMON_3		__ALTMON_3
+# define ALTMON_4		__ALTMON_4
+# define ALTMON_5		__ALTMON_5
+# define ALTMON_6		__ALTMON_6
+# define ALTMON_7		__ALTMON_7
+# define ALTMON_8		__ALTMON_8
+# define ALTMON_9		__ALTMON_9
+# define ALTMON_10		__ALTMON_10
+# define ALTMON_11		__ALTMON_11
+# define ALTMON_12		__ALTMON_12
+#endif
+
+  /* Long month names, in the grammatical form used when the month
+     is named by itself.  */
+  _NL_WALTMON_1,			/* January */
+  _NL_WALTMON_2,
+  _NL_WALTMON_3,
+  _NL_WALTMON_4,
+  _NL_WALTMON_5,
+  _NL_WALTMON_6,
+  _NL_WALTMON_7,
+  _NL_WALTMON_8,
+  _NL_WALTMON_9,
+  _NL_WALTMON_10,
+  _NL_WALTMON_11,
+  _NL_WALTMON_12,
+
   _NL_NUM_LC_TIME,	/* Number of indices in LC_TIME category.  */
 
   /* LC_COLLATE category: text sorting.
diff --git a/locale/programs/ld-time.c b/locale/programs/ld-time.c
index 32e9c41..0383179 100644
--- a/locale/programs/ld-time.c
+++ b/locale/programs/ld-time.c
@@ -91,6 +91,9 @@  struct locale_time_t
   const char *date_fmt;
   const uint32_t *wdate_fmt;
   int alt_digits_defined;
+  const char *alt_mon[12];
+  const uint32_t *walt_mon[12];
+  int alt_mon_defined;
   unsigned char week_ndays;
   uint32_t week_1stday;
   unsigned char week_1stweek;
@@ -652,6 +655,15 @@  time_output (struct localedef_t *locale, const struct
charmap_t *charmap,
   add_locale_string (&file, time->date_fmt);
   add_locale_wstring (&file, time->wdate_fmt);
   add_locale_string (&file, charmap->code_set_name);
+
+  /* The alt'mons.  */
+  for (n = 0; n < 12; ++n)
+    add_locale_string (&file, time->alt_mon[n] ?: "");
+
+  /* The wide character alt'mons.  */
+  for (n = 0; n < 12; ++n)
+    add_locale_wstring (&file, time->walt_mon[n] ?: empty_wstr);
+
   write_locale_data (output_path, LC_TIME, "LC_TIME", &file);
 }
 
@@ -795,6 +807,7 @@  time_read (struct linereader *ldfile, struct localedef_t
*result,
 	  STRARR_ELEM (mon, 12, 12);
 	  STRARR_ELEM (am_pm, 2, 2);
 	  STRARR_ELEM (alt_digits, 0, 100);
+	  STRARR_ELEM (alt_mon, 12, 12);
 
 	case tok_era:
 	  /* Ignore the rest of the line if we don't need the input of
@@ -947,6 +960,14 @@  time_read (struct linereader *ldfile, struct localedef_t
*result,
 	    lr_error (ldfile, _("\
 %1$s: definition does not end with `END %1$s'"), "LC_TIME");
 	  lr_ignore_rest (ldfile, now->tok == tok_lc_time);
+
+	  /* If alt_mon was not specified, make it a copy of mon.  */
+	  if (!ignore_content && !time->alt_mon_defined)
+	    {
+	      memcpy (time->alt_mon, time->mon, sizeof (time->mon));
+	      memcpy (time->walt_mon, time->wmon, sizeof (time->wmon));
+	      time->alt_mon_defined = 1;
+	    }
 	  return;
 
 	default:
diff --git a/locale/programs/locfile-kw.gperf b/locale/programs/locfile-kw.gperf
index 3605d15..3d11cc6 100644
--- a/locale/programs/locfile-kw.gperf
+++ b/locale/programs/locfile-kw.gperf
@@ -148,6 +148,7 @@  first_workday,          tok_first_workday,          0
 cal_direction,          tok_cal_direction,          0
 timezone,               tok_timezone,               0
 date_fmt,               tok_date_fmt,               0
+alt_mon,                tok_alt_mon,                0
 LC_MESSAGES,            tok_lc_messages,            0
 yesexpr,                tok_yesexpr,                0
 noexpr,                 tok_noexpr,                 0
diff --git a/locale/programs/locfile-token.h b/locale/programs/locfile-token.h
index 0c32f2c..2a313b2 100644
--- a/locale/programs/locfile-token.h
+++ b/locale/programs/locfile-token.h
@@ -186,6 +186,7 @@  enum token_t
   tok_cal_direction,
   tok_timezone,
   tok_date_fmt,
+  tok_alt_mon,
   tok_lc_messages,
   tok_yesexpr,
   tok_noexpr,
diff --git a/localedata/tst-langinfo.c b/localedata/tst-langinfo.c
index 1012f56..c23d9e0 100644
--- a/localedata/tst-langinfo.c
+++ b/localedata/tst-langinfo.c
@@ -50,6 +50,18 @@  struct map
   VAL (ABMON_8),
   VAL (ABMON_9),
   VAL (ALT_DIGITS),
+  VAL (ALTMON_1),
+  VAL (ALTMON_10),
+  VAL (ALTMON_11),
+  VAL (ALTMON_12),
+  VAL (ALTMON_2),
+  VAL (ALTMON_3),
+  VAL (ALTMON_4),
+  VAL (ALTMON_5),
+  VAL (ALTMON_6),
+  VAL (ALTMON_7),
+  VAL (ALTMON_8),
+  VAL (ALTMON_9),
   VAL (AM_STR),
   VAL (CRNCYSTR),
   VAL (CURRENCY_SYMBOL),
diff --git a/time/strftime_l.c b/time/strftime_l.c
index b5ba9ca..1c4bed8 100644
--- a/time/strftime_l.c
+++ b/time/strftime_l.c
@@ -492,6 +492,9 @@  __strftime_internal (CHAR_T *s, size_t maxsize, const CHAR_T
*format,
 # define f_month \
   ((const CHAR_T *) (tp->tm_mon < 0 || tp->tm_mon > 11			     \
 		     ? "?" : _NL_CURRENT (LC_TIME, NLW(MON_1) + tp->tm_mon)))
+# define f_altmonth \
+  ((const CHAR_T *) (tp->tm_mon < 0 || tp->tm_mon > 11			     \
+		     ? "?" : _NL_CURRENT (LC_TIME, NLW(ALTMON_1) + tp->tm_mon)))
 # define ampm \
   ((const CHAR_T *) _NL_CURRENT (LC_TIME, tp->tm_hour > 11		      \
 				 ? NLW(PM_STR) : NLW(AM_STR)))
@@ -507,6 +510,7 @@  __strftime_internal (CHAR_T *s, size_t maxsize, const CHAR_T
*format,
 		   ? "?" : month_name[tp->tm_mon])
 #  define a_wkday f_wkday
 #  define a_month f_month
+#  define f_altmonth f_month
 #  define ampm (L_("AMPM") + 2 * (tp->tm_hour > 11))
 
   size_t aw_len = 3;
@@ -785,7 +789,7 @@  __strftime_internal (CHAR_T *s, size_t maxsize, const CHAR_T
*format,
 #endif
 
 	case L_('B'):
-	  if (modifier != 0)
+	  if (modifier == L_('E'))
 	    goto bad_format;
 	  if (change_case)
 	    {
@@ -793,7 +797,10 @@  __strftime_internal (CHAR_T *s, size_t maxsize, const
CHAR_T *format,
 	      to_lowcase = 0;
 	    }
 #if defined _NL_CURRENT || !HAVE_STRFTIME
-	  cpy (STRLEN (f_month), f_month);
+	  if (modifier == L_('O'))
+	    cpy (STRLEN (f_altmonth), f_altmonth);
+	  else
+	    cpy (STRLEN (f_month), f_month);
 	  break;
 #else
 	  goto underlying_strftime;
diff --git a/time/strptime_l.c b/time/strptime_l.c
index 3afc33a..b99f5d2 100644
--- a/time/strptime_l.c
+++ b/time/strptime_l.c
@@ -124,6 +124,8 @@  extern const struct __locale_data _nl_C_LC_TIME
attribute_hidden;
   (&_nl_C_LC_TIME.values[_NL_ITEM_INDEX (ABDAY_1)].string)
 # define month_name (&_nl_C_LC_TIME.values[_NL_ITEM_INDEX (MON_1)].string)
 # define ab_month_name (&_nl_C_LC_TIME.values[_NL_ITEM_INDEX (ABMON_1)].string)
+# define alt_month_name \
+  (&_nl_C_LC_TIME.values[_NL_ITEM_INDEX (ALTMON_1)].string)
 # define HERE_D_T_FMT (_nl_C_LC_TIME.values[_NL_ITEM_INDEX (D_T_FMT)].string)
 # define HERE_D_FMT (_nl_C_LC_TIME.values[_NL_ITEM_INDEX (D_FMT)].string)