handle casing of i/I in Turkic languages

Message ID 9cea6e5e-796d-4e61-9c49-5bd050d80081@towo.net
State New
Headers
Series handle casing of i/I in Turkic languages |

Commit Message

Thomas Wolff Feb. 2, 2026, 1:21 p.m. UTC
  From 99df2722cb39dcf0290dfcd14d7e81ec87853e15 Mon Sep 17 00:00:00 2001
From: Thomas Wolff <towo@towo.net>
Date: Mon, 2 Feb 2026 00:00:00 +0000
Subject: [PATCH] towupper/towlower: handle special casing of "i"/"I" for
 Turkic languages

---
 newlib/libc/ctype/towctrans.c   |  2 +-
 newlib/libc/ctype/towctrans_l.c | 23 +++++++++++++++++++----
 2 files changed, 20 insertions(+), 5 deletions(-)
  

Comments

Corinna Vinschen Feb. 2, 2026, 1:45 p.m. UTC | #1
Hi Thomas,


thanks for the patch.  Just one point:

On Feb  2 14:21, Thomas Wolff wrote:

> From 99df2722cb39dcf0290dfcd14d7e81ec87853e15 Mon Sep 17 00:00:00 2001
> From: Thomas Wolff <towo@towo.net>
> Date: Mon, 2 Feb 2026 00:00:00 +0000
> Subject: [PATCH] towupper/towlower: handle special casing of "i"/"I" for
>  Turkic languages

This is a non-trivial patch, so it would be nice to have a non-trivial
commit message explaining what you're doing and why.  To the uninformed
reader, the explicit exception for Turkic languages looks arbitrary.

Care to add a bit of text?


Thanks,
Corinna



> 
> ---
>  newlib/libc/ctype/towctrans.c   |  2 +-
>  newlib/libc/ctype/towctrans_l.c | 23 +++++++++++++++++++----
>  2 files changed, 20 insertions(+), 5 deletions(-)
> 
> diff --git a/newlib/libc/ctype/towctrans.c b/newlib/libc/ctype/towctrans.c
> index 176aa3d9d..2cd34184e 100644
> --- a/newlib/libc/ctype/towctrans.c
> +++ b/newlib/libc/ctype/towctrans.c
> @@ -81,7 +81,7 @@ _towctrans_r (struct _reent *r,
>  	wctrans_t w)
>  {
>    if (w == WCT_TOLOWER || w == WCT_TOUPPER)
> -    return towctrans_l (c, w, 0);
> +    return towctrans_l (c, w, LC_GLOBAL_LOCALE);
>    else
>      {
>        // skipping this because it was causing trouble (cygwin crash)
> diff --git a/newlib/libc/ctype/towctrans_l.c b/newlib/libc/ctype/towctrans_l.c
> index e94d6f492..2b843f302 100644
> --- a/newlib/libc/ctype/towctrans_l.c
> +++ b/newlib/libc/ctype/towctrans_l.c
> @@ -72,9 +72,21 @@ bisearch (wint_t ucs, const struct caseconv_entry *table, int max)
>    return 0;
>  }
>  
> +static int
> +isturk (struct __locale_t *locale)
> +{
> +  const char * loc = getlocalename_l (LC_CTYPE, locale);
> +  if (!loc)
> +    return 0;
> +  return 0 == strncmp (loc, "tr", 2) || 0 == strncmp (loc, "az", 2);
> +}
> +
>  static wint_t
> -toulower (wint_t c)
> +toulower (wint_t c, struct __locale_t *locale)
>  {
> +  if (c == 'I' && isturk (locale))
> +    return 0x131; // LATIN SMALL LETTER DOTLESS I
> +
>    const struct caseconv_entry * cce =
>      bisearch(c, caseconv_table,
>               sizeof(caseconv_table) / sizeof(*caseconv_table) - 1);
> @@ -108,8 +120,11 @@ toulower (wint_t c)
>  }
>  
>  static wint_t
> -touupper (wint_t c)
> +touupper (wint_t c, struct __locale_t *locale)
>  {
> +  if (c == 'i' && isturk (locale))
> +    return 0x130; // LATIN CAPITAL LETTER I WITH DOT ABOVE
> +
>    const struct caseconv_entry * cce =
>      bisearch(c, caseconv_table,
>               sizeof(caseconv_table) / sizeof(*caseconv_table) - 1);
> @@ -151,9 +166,9 @@ towctrans_l (wint_t c, wctrans_t w, struct __locale_t *locale)
>    wint_t u = _jp2uc_l (c, locale);
>    wint_t res;
>    if (w == WCT_TOLOWER)
> -    res = toulower (u);
> +    res = toulower (u, locale);
>    else if (w == WCT_TOUPPER)
> -    res = touupper (u);
> +    res = touupper (u, locale);
>    else
>      {
>        // skipping the errno setting that was previously involved
> -- 
> 2.51.0
>
  
Thomas Wolff Feb. 2, 2026, 5:05 p.m. UTC | #2
Am 02.02.2026 um 14:45 schrieb Corinna Vinschen:
> Hi Thomas,
>
>
> thanks for the patch.  Just one point:
>
> On Feb  2 14:21, Thomas Wolff wrote:
>
>>  From 99df2722cb39dcf0290dfcd14d7e81ec87853e15 Mon Sep 17 00:00:00 2001
>> From: Thomas Wolff <towo@towo.net>
>> Date: Mon, 2 Feb 2026 00:00:00 +0000
>> Subject: [PATCH] towupper/towlower: handle special casing of "i"/"I" for
>>   Turkic languages
> This is a non-trivial patch, so it would be nice to have a non-trivial
> commit message explaining what your doing and why.  To the uninformed
> reader, the explicit exception for the turk language looks arbitrary.
>
> Care to add a bit of text?
Quite a bit, very well, as attached.
Thomas

> Thanks,
> Corinna
>
>
>
>> ---
>>   newlib/libc/ctype/towctrans.c   |  2 +-
>>   newlib/libc/ctype/towctrans_l.c | 23 +++++++++++++++++++----
>>   2 files changed, 20 insertions(+), 5 deletions(-)
>>
>> diff --git a/newlib/libc/ctype/towctrans.c b/newlib/libc/ctype/towctrans.c
>> index 176aa3d9d..2cd34184e 100644
>> --- a/newlib/libc/ctype/towctrans.c
>> +++ b/newlib/libc/ctype/towctrans.c
>> @@ -81,7 +81,7 @@ _towctrans_r (struct _reent *r,
>>   	wctrans_t w)
>>   {
>>     if (w == WCT_TOLOWER || w == WCT_TOUPPER)
>> -    return towctrans_l (c, w, 0);
>> +    return towctrans_l (c, w, LC_GLOBAL_LOCALE);
>>     else
>>       {
>>         // skipping this because it was causing trouble (cygwin crash)
>> diff --git a/newlib/libc/ctype/towctrans_l.c b/newlib/libc/ctype/towctrans_l.c
>> index e94d6f492..2b843f302 100644
>> --- a/newlib/libc/ctype/towctrans_l.c
>> +++ b/newlib/libc/ctype/towctrans_l.c
>> @@ -72,9 +72,21 @@ bisearch (wint_t ucs, const struct caseconv_entry *table, int max)
>>     return 0;
>>   }
>>   
>> +static int
>> +isturk (struct __locale_t *locale)
>> +{
>> +  const char * loc = getlocalename_l (LC_CTYPE, locale);
>> +  if (!loc)
>> +    return 0;
>> +  return 0 == strncmp (loc, "tr", 2) || 0 == strncmp (loc, "az", 2);
>> +}
>> +
>>   static wint_t
>> -toulower (wint_t c)
>> +toulower (wint_t c, struct __locale_t *locale)
>>   {
>> +  if (c == 'I' && isturk (locale))
>> +    return 0x131; // LATIN SMALL LETTER DOTLESS I
>> +
>>     const struct caseconv_entry * cce =
>>       bisearch(c, caseconv_table,
>>                sizeof(caseconv_table) / sizeof(*caseconv_table) - 1);
>> @@ -108,8 +120,11 @@ toulower (wint_t c)
>>   }
>>   
>>   static wint_t
>> -touupper (wint_t c)
>> +touupper (wint_t c, struct __locale_t *locale)
>>   {
>> +  if (c == 'i' && isturk (locale))
>> +    return 0x130; // LATIN CAPITAL LETTER I WITH DOT ABOVE
>> +
>>     const struct caseconv_entry * cce =
>>       bisearch(c, caseconv_table,
>>                sizeof(caseconv_table) / sizeof(*caseconv_table) - 1);
>> @@ -151,9 +166,9 @@ towctrans_l (wint_t c, wctrans_t w, struct __locale_t *locale)
>>     wint_t u = _jp2uc_l (c, locale);
>>     wint_t res;
>>     if (w == WCT_TOLOWER)
>> -    res = toulower (u);
>> +    res = toulower (u, locale);
>>     else if (w == WCT_TOUPPER)
>> -    res = touupper (u);
>> +    res = touupper (u, locale);
>>     else
>>       {
>>         // skipping the errno setting that was previously involved
>> -- 
>> 2.51.0
>>
From 99df2722cb39dcf0290dfcd14d7e81ec87853e15 Mon Sep 17 00:00:00 2001
From: Thomas Wolff <towo@towo.net>
Date: Mon, 2 Feb 2026 00:00:00 +0000
Subject: [PATCH] towupper/towlower: handle Turkic language special casing

For case conversion, Unicode has a standard mapping and a separate
list of mapping rules for special cases (file SpecialCasing.txt),
some of which are also language-dependent (as configured via locale).
However, most of these rules are context-dependent, e.g. Greek capital
Sigma is lowered to one of two different small sigmas, depending on
whether it occurs at the end of a word.
The POSIX API functions towupper and towlower cannot consider context
as they work only on one character at a time. String casing functions
are unfortunately not available. The only special case conversions
that apply to a single character are i and I in Turkish and Azerbaijani,
where i keeps its dot when capitalised (U+0130) and I stays dotless
when converted to lowercase (U+0131).
The patch handles these special cases, based on the locale.

---
 newlib/libc/ctype/towctrans.c   |  2 +-
 newlib/libc/ctype/towctrans_l.c | 23 +++++++++++++++++++----
 2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/newlib/libc/ctype/towctrans.c b/newlib/libc/ctype/towctrans.c
index 176aa3d9d..2cd34184e 100644
--- a/newlib/libc/ctype/towctrans.c
+++ b/newlib/libc/ctype/towctrans.c
@@ -81,7 +81,7 @@ _towctrans_r (struct _reent *r,
 	wctrans_t w)
 {
   if (w == WCT_TOLOWER || w == WCT_TOUPPER)
-    return towctrans_l (c, w, 0);
+    return towctrans_l (c, w, LC_GLOBAL_LOCALE);
   else
     {
       // skipping this because it was causing trouble (cygwin crash)
diff --git a/newlib/libc/ctype/towctrans_l.c b/newlib/libc/ctype/towctrans_l.c
index e94d6f492..2b843f302 100644
--- a/newlib/libc/ctype/towctrans_l.c
+++ b/newlib/libc/ctype/towctrans_l.c
@@ -72,9 +72,21 @@ bisearch (wint_t ucs, const struct caseconv_entry *table, int max)
   return 0;
 }
 
+static int
+isturk (struct __locale_t *locale)
+{
+  const char * loc = getlocalename_l (LC_CTYPE, locale);
+  if (!loc)
+    return 0;
+  return 0 == strncmp (loc, "tr", 2) || 0 == strncmp (loc, "az", 2);
+}
+
 static wint_t
-toulower (wint_t c)
+toulower (wint_t c, struct __locale_t *locale)
 {
+  if (c == 'I' && isturk (locale))
+    return 0x131; // LATIN SMALL LETTER DOTLESS I
+
   const struct caseconv_entry * cce =
     bisearch(c, caseconv_table,
              sizeof(caseconv_table) / sizeof(*caseconv_table) - 1);
@@ -108,8 +120,11 @@ toulower (wint_t c)
 }
 
 static wint_t
-touupper (wint_t c)
+touupper (wint_t c, struct __locale_t *locale)
 {
+  if (c == 'i' && isturk (locale))
+    return 0x130; // LATIN CAPITAL LETTER I WITH DOT ABOVE
+
   const struct caseconv_entry * cce =
     bisearch(c, caseconv_table,
              sizeof(caseconv_table) / sizeof(*caseconv_table) - 1);
@@ -151,9 +166,9 @@ towctrans_l (wint_t c, wctrans_t w, struct __locale_t *locale)
   wint_t u = _jp2uc_l (c, locale);
   wint_t res;
   if (w == WCT_TOLOWER)
-    res = toulower (u);
+    res = toulower (u, locale);
   else if (w == WCT_TOUPPER)
-    res = touupper (u);
+    res = touupper (u, locale);
   else
     {
       // skipping the errno setting that was previously involved
  
Corinna Vinschen Feb. 2, 2026, 5:32 p.m. UTC | #3
On Feb  2 18:05, Thomas Wolff wrote:
> 
> 
> Am 02.02.2026 um 14:45 schrieb Corinna Vinschen:
> > Hi Thomas,
> > 
> > 
> > thanks for the patch.  Just one point:
> > 
> > On Feb  2 14:21, Thomas Wolff wrote:
> > 
> > >  From 99df2722cb39dcf0290dfcd14d7e81ec87853e15 Mon Sep 17 00:00:00 2001
> > > From: Thomas Wolff <towo@towo.net>
> > > Date: Mon, 2 Feb 2026 00:00:00 +0000
> > > Subject: [PATCH] towupper/towlower: handle special casing of "i"/"I" for
> > >   Turkic languages
> > This is a non-trivial patch, so it would be nice to have a non-trivial
> > commit message explaining what your doing and why.  To the uninformed
> > reader, the explicit exception for the turk language looks arbitrary.
> > 
> > Care to add a bit of text?
> Quite a bit, very well, as attached.

Looks great.  Pushed.


Thanks,
Corinna
  
Brian Inglis Feb. 3, 2026, 6:08 a.m. UTC | #4
Hi Thomas,

Is upper casing with accents in fr-CA vs without in fr-FR handled?

On 2026-02-02 10:05, Thomas Wolff wrote:
> Am 02.02.2026 um 14:45 schrieb Corinna Vinschen:
>> Hi Thomas,
>> thanks for the patch.  Just one point:

>> This is a non-trivial patch, so it would be nice to have a non-trivial
>> commit message explaining what your doing and why.  To the uninformed
>> reader, the explicit exception for the turk language looks arbitrary.
>> Care to add a bit of text?

> Quite a bit, very well, as attached.
  
Corinna Vinschen Feb. 3, 2026, 12:21 p.m. UTC | #5
[IDK if Thomas is subscribed.  CC'ing him here...]

On Feb  2 23:08, Brian Inglis wrote:
> Hi Thomas,
> 
> Is upper casing with accents in fr-CA vs without in fr-FR handled?
> 
> On 2026-02-02 10:05, Thomas Wolff wrote:
> > Am 02.02.2026 um 14:45 schrieb Corinna Vinschen:
> > > Hi Thomas,
> > > thanks for the patch.  Just one point:
> 
> > > This is a non-trivial patch, so it would be nice to have a non-trivial
> > > commit message explaining what your doing and why.  To the uninformed
> > > reader, the explicit exception for the turk language looks arbitrary.
> > > Care to add a bit of text?
> 
> > Quite a bit, very well, as attached.
> -- 
> Take care. Thanks, Brian Inglis              Calgary, Alberta, Canada
> 
> La perfection est atteinte                   Perfection is achieved
> non pas lorsqu'il n'y a plus rien à ajouter  not when there is no more to add
> mais lorsqu'il n'y a plus rien à retrancher  but when there is no more to cut
>                                 -- Antoine de Saint-Exupéry
  

Patch

diff --git a/newlib/libc/ctype/towctrans.c b/newlib/libc/ctype/towctrans.c
index 176aa3d9d..2cd34184e 100644
--- a/newlib/libc/ctype/towctrans.c
+++ b/newlib/libc/ctype/towctrans.c
@@ -81,7 +81,7 @@  _towctrans_r (struct _reent *r,
 	wctrans_t w)
 {
   if (w == WCT_TOLOWER || w == WCT_TOUPPER)
-    return towctrans_l (c, w, 0);
+    return towctrans_l (c, w, LC_GLOBAL_LOCALE);
   else
     {
       // skipping this because it was causing trouble (cygwin crash)
diff --git a/newlib/libc/ctype/towctrans_l.c b/newlib/libc/ctype/towctrans_l.c
index e94d6f492..2b843f302 100644
--- a/newlib/libc/ctype/towctrans_l.c
+++ b/newlib/libc/ctype/towctrans_l.c
@@ -72,9 +72,21 @@  bisearch (wint_t ucs, const struct caseconv_entry *table, int max)
   return 0;
 }
 
+static int
+isturk (struct __locale_t *locale)
+{
+  const char * loc = getlocalename_l (LC_CTYPE, locale);
+  if (!loc)
+    return 0;
+  return 0 == strncmp (loc, "tr", 2) || 0 == strncmp (loc, "az", 2);
+}
+
 static wint_t
-toulower (wint_t c)
+toulower (wint_t c, struct __locale_t *locale)
 {
+  if (c == 'I' && isturk (locale))
+    return 0x131; // LATIN SMALL LETTER DOTLESS I
+
   const struct caseconv_entry * cce =
     bisearch(c, caseconv_table,
              sizeof(caseconv_table) / sizeof(*caseconv_table) - 1);
@@ -108,8 +120,11 @@  toulower (wint_t c)
 }
 
 static wint_t
-touupper (wint_t c)
+touupper (wint_t c, struct __locale_t *locale)
 {
+  if (c == 'i' && isturk (locale))
+    return 0x130; // LATIN CAPITAL LETTER I WITH DOT ABOVE
+
   const struct caseconv_entry * cce =
     bisearch(c, caseconv_table,
              sizeof(caseconv_table) / sizeof(*caseconv_table) - 1);
@@ -151,9 +166,9 @@  towctrans_l (wint_t c, wctrans_t w, struct __locale_t *locale)
   wint_t u = _jp2uc_l (c, locale);
   wint_t res;
   if (w == WCT_TOLOWER)
-    res = toulower (u);
+    res = toulower (u, locale);
   else if (w == WCT_TOUPPER)
-    res = touupper (u);
+    res = touupper (u, locale);
   else
     {
       // skipping the errno setting that was previously involved