Unicode 17.0 updates: build scripts and data tables

Message ID 2d898170-e83a-41bb-bc1a-9d1a0999a6a2@towo.net
State New
Headers
Series Unicode 17.0 updates: build scripts and data tables |

Commit Message

Thomas Wolff Dec. 20, 2025, 1:59 p.m. UTC
  Patch 0002 updates the newlib data tables to Unicode 17.0.
Two of the build scripts had to be updated first because the layout of the
Unicode data files changed slightly.
For this purpose they were synchronized with the corresponding scripts from
mintty. This is patch 0001.
Thomas
From c35d2803654e8c96ef0ef6d11d9b7a3604f3163f Mon Sep 17 00:00:00 2001
From: Thomas Wolff <towo@towo.net>
Date: Sat, 20 Dec 2025 00:00:00 +0000
Subject: [PATCH 1/2] Unicode table build: update scripts for generation of
 width data

This is needed to handle recent changes in the layout of the Unicode.org data files.
---
 newlib/libc/string/mkwide   | 32 ++++++++++++++++----------------
 newlib/libc/string/mkwidthA | 19 ++++++++++---------
 2 files changed, 26 insertions(+), 25 deletions(-)
  

Comments

Corinna Vinschen Dec. 22, 2025, 11:41 a.m. UTC | #1
On Dec 20 14:59, Thomas Wolff wrote:
> Patch 0002 updates newlib data tables to Unicode 17.0.
> Two of the build scripts needed an update first as the Unicode file layout
> slightly changed.
> They were synchronized with the same respective scripts from mintty for this
> purpose. This is patch 0001.
> Thomas

Thank you!  Pushed to main and the cygwin-3_6-branch.


Corinna
  

Patch

diff --git a/newlib/libc/string/mkwide b/newlib/libc/string/mkwide
index 55a0bab43..42f36507e 100755
--- a/newlib/libc/string/mkwide
+++ b/newlib/libc/string/mkwide
@@ -4,20 +4,24 @@ 
 
 skipcheck=false
 
-if [ ! -r EastAsianWidth.txt ]
-then	ln -s /usr/share/unicode/ucd/EastAsianWidth.txt . || exit 1
+if $skipcheck || make Blocks.txt >&2
+then	true
+else	echo Could not acquire Unicode data file Blocks.txt >&2
+	exit 1
 fi
-if [ ! -r UnicodeData.txt ]
-then	ln -s /usr/share/unicode/ucd/UnicodeData.txt . || exit 1
+if $skipcheck || make EastAsianWidth.txt >&2
+then	true
+else	echo Could not acquire Unicode data file EastAsianWidth.txt >&2
+	exit 1
 fi
-if [ ! -r Blocks.txt ]
-then	ln -s /usr/share/unicode/ucd/Blocks.txt . || exit 1
+if $skipcheck || make UnicodeData.txt >&2
+then	true
+else	echo Could not acquire Unicode data file UnicodeData.txt >&2
+	exit 1
 fi
 
-sed -e "s,^\([^;]*\);[NAH],\1," -e t -e d EastAsianWidth.txt > wide.na
-sed -e "s,^\([^;]*\);[WF],\1," -e t -e d EastAsianWidth.txt > wide.fw
-
-PATH="$PATH:." # for uniset
+sed -e "s,^\([^;]*\) *; *[NAH],\1," -e t -e d EastAsianWidth.txt > wide.na
+sed -e "s,^\([^;]*\) *; *[WF],\1," -e t -e d EastAsianWidth.txt > wide.fw
 
 nrfw=`uniset +wide.fw nr | sed -e 's,.*:,,'`
 echo FW $nrfw
@@ -31,7 +35,7 @@  includes () {
 	nr=`uniset +wide.$2 -$1 nr | sed -e 's,.*:,,'`
 	test $nr != $3
 }
-echo "adding compact closure of wide ranges, this may take ~10min"
+echo "adding compact closure of wide ranges, this may take a few minutes"
 for b in $extrablocks `sed -e 's,^\([0-9A-F]*\)\.\.\([0-9A-F]*\).*,\1-\2,' -e t -e d Blocks.txt`
 do	range=$b
 	echo checking $range $* >&2
@@ -40,10 +44,6 @@  do	range=$b
 	fi
 done > wide.blocks
 
-(
-sed -e "s,^,//," -e 1q EastAsianWidth.txt
-sed -e "s,^,//," -e 1q Blocks.txt
-uniset `sed -e 's,^,+,' wide.blocks` +wide.fw c
-) > wide.t
+uniset `sed -e 's,^,+,' wide.blocks` +wide.fw c > wide.t
 
 rm -f wide.na wide.fw wide.blocks
diff --git a/newlib/libc/string/mkwidthA b/newlib/libc/string/mkwidthA
index 343ab4016..fc6f3d2bf 100755
--- a/newlib/libc/string/mkwidthA
+++ b/newlib/libc/string/mkwidthA
@@ -3,18 +3,19 @@ 
 # generate WIDTH-A file, listing Unicode characters with width property
 # Ambiguous, from EastAsianWidth.txt
 
-if [ ! -r EastAsianWidth.txt ]
-then	ln -s /usr/share/unicode/ucd/EastAsianWidth.txt . || exit 1
+if make EastAsianWidth.txt >&2
+then	true
+else	echo Could not acquire Unicode data file EastAsianWidth.txt >&2
+	exit 1
 fi
-if [ ! -r UnicodeData.txt ]
-then	ln -s /usr/share/unicode/ucd/UnicodeData.txt . || exit 1
-fi
-if [ ! -r Blocks.txt ]
-then	ln -s /usr/share/unicode/ucd/Blocks.txt . || exit 1
+if make UnicodeData.txt >&2
+then	true
+else	echo Could not acquire Unicode data file UnicodeData.txt >&2
+	exit 1
 fi
 
-sed -e "s,^\([^;]*\);A,\1," -e t -e d EastAsianWidth.txt > width-a-new
+sed -e "s,^\([^;]*\) *; *A,\1," -e t -e d EastAsianWidth.txt > width-a-new
 rm -f WIDTH-A
 echo "# UAX #11: East Asian Ambiguous" > WIDTH-A
-PATH="$PATH:." uniset +width-a-new compact >> WIDTH-A
+uniset +width-a-new compact >> WIDTH-A
 rm -f width-a-new