[PROPOSED] New file tzdata.zi for text-format tzdata
* NEWS: Document this. * Makefile (TZDATA_TEXT, TZDATA_ZI_DEPS): New macros. (TABDATA): Use them. (DATA): Redo to not include tzdata.zi, since it is distributed as part of tzdb, not tzdata. (AWK_SCRIPTS): Add zishrink.awk. (ENCHILADA): Add tzdata.zi. (install): Install tzdata.zi and leapseconds, if requested by TZDATA_TEXT. (tzdata.zi): New rule. (install_data, $(TZS_NEW), zonenames): Use tzdata.zi instead of its inputs. (check_character_set, check_links): Check tzdata.zi. (clean): Remove tzdata.zi. (set-timestamps.out): Set tzdata.zi’s time stamp. (check_public): Check that tzdata.zi can be compiled. * zishrink.awk: New file. --- Makefile | 58 +++++++++++++++++++++++++------------- NEWS | 19 ++++++++++--- zishrink.awk | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 145 insertions(+), 23 deletions(-) create mode 100644 zishrink.awk diff --git a/Makefile b/Makefile index 6373920..e037b7c 100644 --- a/Makefile +++ b/Makefile @@ -92,6 +92,17 @@ LIBDIR= $(TOPDIR)/lib REDO= posix_right +# To install data in text form that has all the information of the binary data, +# (optionally incorporating leap second information), use +# TZDATA_TEXT= tzdata.zi leapseconds +# To install text data without leap second information (e.g., because +# REDO='posix_only'), use +# TZDATA_TEXT= tzdata.zi +# To avoid installing text data, use +# TZDATA_TEXT= + +TZDATA_TEXT= leapseconds tzdata.zi + # For backward-compatibility links for old zone names, use # BACKWARD= backward pacificnew # To omit these links, use @@ -395,18 +406,19 @@ YDATA= $(PRIMARY_YDATA) etcetera $(BACKWARD) NDATA= systemv factory TDATA= $(YDATA) $(NDATA) ZONETABLES= zone1970.tab zone.tab -TABDATA= iso3166.tab leapseconds $(ZONETABLES) +TABDATA= iso3166.tab $(TZDATA_TEXT) $(ZONETABLES) LEAP_DEPS= leapseconds.awk leap-seconds.list -DATA= $(YDATA) $(NDATA) backzone $(TABDATA) \ - leap-seconds.list yearistype.sh -AWK_SCRIPTS= checklinks.awk checktab.awk leapseconds.awk +TZDATA_ZI_DEPS= 
zishrink.awk $(TDATA) $(PACKRATDATA) +DATA= $(YDATA) $(NDATA) backzone iso3166.tab leap-seconds.list \ + leapseconds yearistype.sh $(ZONETABLES) +AWK_SCRIPTS= checklinks.awk checktab.awk leapseconds.awk zishrink.awk MISC= $(AWK_SCRIPTS) zoneinfo2tdf.pl TZS_YEAR= 2050 TZS= to$(TZS_YEAR).tzs TZS_NEW= to$(TZS_YEAR)new.tzs TZS_DEPS= $(PRIMARY_YDATA) asctime.c localtime.c \ private.h tzfile.h zdump.c zic.c -ENCHILADA= $(COMMON) $(DOCS) $(SOURCES) $(DATA) $(MISC) $(TZS) +ENCHILADA= $(COMMON) $(DOCS) $(SOURCES) $(DATA) $(MISC) $(TZS) tzdata.zi # Consult these files when deciding whether to rebuild the 'version' file. # This list is not the same as the output of 'git ls-files', since @@ -443,7 +455,7 @@ install: all $(DATA) $(REDO) $(MANS) $(DESTDIR)$(MANDIR)/man3 $(DESTDIR)$(MANDIR)/man5 \ $(DESTDIR)$(MANDIR)/man8 $(ZIC_INSTALL) -l $(LOCALTIME) -p $(POSIXRULES) - cp -f iso3166.tab $(ZONETABLES) $(DESTDIR)$(TZDIR)/. + cp -f $(TABDATA) $(DESTDIR)$(TZDIR)/. cp tzselect zic zdump $(DESTDIR)$(ETCDIR)/. cp libtz.a $(DESTDIR)$(LIBDIR)/. $(RANLIB) $(DESTDIR)$(LIBDIR)/libtz.a @@ -464,6 +476,13 @@ version: $(VERSION_DEPS) printf '%s\n' "$$V" >$@.out mv $@.out $@ +# This file can be tailored by setting BACKWARD, PACKRATDATA, etc. +tzdata.zi: $(TZDATA_ZI_DEPS) + $(AWK) -v PACKRATDATA='$(PACKRATDATA)' \ + -f zishrink.awk \ + $(TDATA) $(PACKRATDATA) >$@.out + mv $@.out $@ + version.h: version VERSION=`cat version` && printf '%s\n' \ 'static char const PKGVERSION[]="($(PACKAGE)) ";' \ @@ -498,10 +517,8 @@ INSTALLARGS = \ ZIC='$(ZIC)' # 'make install_data' installs one set of tz binary files. -# It can be tailored by setting LEAPSECONDS, PACKRATDATA, etc. 
-install_data: zic leapseconds yearistype $(PACKRATDATA) $(TDATA) - $(ZIC_INSTALL) $(TDATA) - $(AWK) '/^Rule/' $(TDATA) | $(ZIC_INSTALL) - $(PACKRATDATA) +install_data: zic leapseconds yearistype tzdata.zi + $(ZIC_INSTALL) tzdata.zi posix_only: $(MAKE) $(INSTALLARGS) LEAPSECONDS= install_data @@ -538,14 +555,14 @@ posix_packrat: zones: $(REDO) -$(TZS_NEW): $(TDATA) zdump zic +$(TZS_NEW): tzdata.zi zdump zic mkdir -p tzs.dir - $(zic) -d tzs.dir $(TDATA) + $(zic) -d tzs.dir tzdata.zi $(AWK) '/^Link/{print $$1 "\t" $$2 "\t" $$3}' \ - $(TDATA) | LC_ALL=C sort >$@.out + tzdata.zi | LC_ALL=C sort >$@.out wd=`pwd` && \ zones=`$(AWK) -v wd="$$wd" \ - '/^Zone/{print wd "/tzs.dir/" $$2}' $(TDATA) \ + '/^Zone/{print wd "/tzs.dir/" $$2}' tzdata.zi \ | LC_ALL=C sort` && \ ./zdump -i -c $(TZS_YEAR) $$zones >>$@.out sed 's,^TZ=".*tzs\.dir/,TZ=",' $@.out >$@.sed.out @@ -589,7 +606,8 @@ check_character_set: $(ENCHILADA) sharp='#' && \ ! grep -Env $(SAFE_LINE) $(MANS) date.1 $(MANTXTS) \ $(MISC) $(SOURCES) $(WEB_PAGES) \ - CONTRIBUTING LICENSE Makefile README version && \ + CONTRIBUTING LICENSE Makefile README \ + version tzdata.zi && \ ! grep -Env $(SAFE_SHARP_LINE) $(TDATA) backzone \ leapseconds yearistype.sh zone.tab && \ ! 
grep -Env $(OK_LINE) $(ENCHILADA) @@ -613,6 +631,7 @@ check_sorted: backward backzone iso3166.tab zone.tab zone1970.tab check_links: checklinks.awk $(TDATA) $(AWK) -f checklinks.awk $(TDATA) + $(AWK) -f checklinks.awk tzdata.zi check_tables: checktab.awk $(PRIMARY_YDATA) $(ZONETABLES) for tab in $(ZONETABLES); do \ @@ -630,7 +649,7 @@ clean_misc: rm -f core *.o *.out \ date tzselect version.h zdump zic yearistype libtz.a clean: clean_misc - rm -fr *.dir tzdb-*/ $(TZS_NEW) + rm -fr *.dir tzdata.zi tzdb-*/ $(TZS_NEW) maintainer-clean: clean @echo 'This command is intended for maintainers to use; it' @@ -684,6 +703,7 @@ set-timestamps.out: $(ENCHILADA) touch -cmr `ls -t $$file workman.sh | sed 1q` $$file.txt || \ exit; \ done + touch -cmr `ls -t $(TZDATA_ZI_DEPS) | sed 1q` tzdata.zi touch -cmr `ls -t $(TZS_DEPS) | sed 1q` $(TZS) touch -cmr `ls -t $(VERSION_DEPS) | sed 1q` version touch $@ @@ -695,7 +715,7 @@ check_public: $(MAKE) maintainer-clean $(MAKE) "CFLAGS=$(GCC_DEBUG_FLAGS)" ALL mkdir -p public.dir - for i in $(TDATA) ; do \ + for i in $(TDATA) tzdata.zi; do \ $(zic) -v -d public.dir $$i 2>&1 || exit; \ done $(zic) -v -d public.dir $(TDATA) @@ -789,8 +809,8 @@ typecheck: $(MAKE) clean ; \ done -zonenames: $(TDATA) - @$(AWK) '/^Zone/ { print $$2 } /^Link/ { print $$3 }' $(TDATA) +zonenames: tzdata.zi + @$(AWK) '/^Zone/ { print $$2 } /^Link/ { print $$3 }' tzdata.zi asctime.o: private.h tzfile.h date.o: private.h diff --git a/NEWS b/NEWS index d22d561..8c81caa 100644 --- a/NEWS +++ b/NEWS @@ -10,6 +10,21 @@ Unreleased, experimental changes Add 7 s to the UT offset in Asia/Yangon before 1920. + Changes to build procedure + + To support applications that prefer to read time zone data in text + form, two zic input files tzdata.zi and leapseconds are now + installed by default. The commands 'zic tzdata.zi' and 'zic -L + leapseconds tzdata.zi' can reproduce the tzdata binary files + without and with leap seconds, respectively. 
To prevent these two + new files from being installed, use 'make TZDATA_TEXT=', and to + suppress leap seconds from the tzdata text installation, use 'make + TZDATA_TEXT=tzdata.zi'. + + 'make BACKWARD=' now suppresses backward-compatibility names + like 'US/Pacific' that are defined in the 'backward' and + 'pacificnew' files. + Changes to code zic and the reference runtime now reject multiple leap seconds @@ -27,10 +42,6 @@ Unreleased, experimental changes Also, zic warns about the undocumented usage with a "last-" prefix, e.g., "last-Fri". - 'make BACKWARD=' now suppresses backward-compatibility names - like 'US/Pacific' that are defined in the 'backward' and - 'pacificnew' files. - Several minor changes have been made to the code to make it a bit easier to port to MS-Windows. (Thanks to Kees Dekker for reporting the problems.) diff --git a/zishrink.awk b/zishrink.awk new file mode 100644 index 0000000..42240f1 --- /dev/null +++ b/zishrink.awk @@ -0,0 +1,91 @@ +# Convert tzdata source into a smaller version of itself. + +# Contributed by Paul Eggert. This file is in the public domain. + +# This is not a general-purpose converter; it is designed for current tzdata. +# 'zic' should treat this script's output as if it were identical to +# this script's input. + +BEGIN { + print "# This zic input file is in the public domain." + if (PACKRATDATA) { + while (0 < (getline line <PACKRATDATA)) { + if (split(line, field)) { + if (field[1] == "Zone") packrat_zone[field[2]] = 1 + if (field[1] == "Link") packrat_zone[field[3]] = 1 + } + } + close (PACKRATDATA) + } +} + +# Remove comments, normalize spaces, and append a space to each line. +/^[[:space:]]*[^#[:space:]]/ { + line = $0 + sub(/#.*/, "", line) + line = line " " + gsub(/[[:space:]]+/, " ", line) + + # SystemV rules are not needed. + if (line ~ /^Rule SystemV /) next + + # Replace FooAsia rules with the same rules without "Asia", as they + # are duplicates. 
+ if (n = match(line, /[^ ]Asia /)) { + if (line ~ /^Rule /) next + line = substr(line, 1, n) substr(line, n + 5) + } + + # Abbreviate times. + while (n = match(line, /[: ]0+[0-9]/)) { + line = substr(line, 1, n) substr(line, n + RLENGTH - 1) + } + while (n = match(line, /:0[^:]/)) { + line = substr(line, 1, n - 1) substr(line, n + 2) + } + + # Abbreviate weekday names. Do not abbreviate "Sun" and "Sat", as + # pre-2017c zic erroneously diagnoses "Su" and "Sa" as ambiguous. + while (n = match(line, /[ l]Mon[<>]/)) { + line = substr(line, 1, n + 1) substr(line, n + 4) + } + while (n = match(line, /[ l]Tue[<>]/)) { + line = substr(line, 1, n + 2) substr(line, n + 4) + } + while (n = match(line, /[ l]Wed[<>]/)) { + line = substr(line, 1, n + 1) substr(line, n + 4) + } + while (n = match(line, /[ l]Thu[<>]/)) { + line = substr(line, 1, n + 2) substr(line, n + 4) + } + while (n = match(line, /[ l]Fri[<>]/)) { + line = substr(line, 1, n + 1) substr(line, n + 4) + } + + # Abbreviate "only" and month names. + gsub(/ only /, " o ", line) + gsub(/ Jan /, " Ja ", line) + gsub(/ Feb /, " F ", line) + gsub(/ Apr /, " Ap ", line) + gsub(/ Aug /, " Au ", line) + gsub(/ Sep /, " S ", line) + gsub(/ Oct /, " O ", line) + gsub(/ Nov /, " N ", line) + gsub(/ Dec /, " D ", line) + + # Strip leading and trailing space. + sub(/^ /, "", line) + sub(/ $/, "", line) + + # Remove unnecessary trailing zero fields. + sub(/ 0+$/, "", line) + + # Output lines unless they are later overridden in PACKRATDATA. + if (FILENAME != PACKRATDATA && line ~ /^[LZ]/) { + overridden = 0 + split(line, field) + overridden = packrat_zone[field[2 + (field[1] == "Link")]] + } + if (!overridden) + print line +} -- 2.9.4
Is that awk or gawk? I don't recognize some parts of it, and I would consider myself an awk expert. If it's gawk, please explicitly say so. -- Randal L. Schwartz - Stonehenge Consulting Services, Inc. - +1 503 777 0095 <merlyn@stonehenge.com> <URL:http://www.stonehenge.com/merlyn/> Perl/Unix consulting, Technical writing, Comedy, etc. etc. Still trying to think of something clever for the fourth line of this .sig
On 05/24/2017 05:04 PM, Randal L. Schwartz wrote:
Is that awk or gawk?
It's supposed to be portable Awk code, as per the POSIX standard and without using Gawk extensions; see: http://pubs.opengroup.org/onlinepubs/9699919799/utilities/awk.html Although the new script does not work with circa-1977 Awk, the existing tzdb code doesn't work either: for example, Makefile uses 'awk -v', which old Awks don't grok. If you're building tzdb on Solaris where the default awk is quite old, you should use 'make AWK=gawk' or (for Solaris 10 and earlier) 'make AWK=/usr/xpg4/bin/awk' to get a POSIX-compatible Awk; this is needed both before and after the proposed patch. As I recall, Solaris is the only major holdout in this area. If the new script introduces a new portability problem with Awk, I would like to know about it. By the way, the proposed patch misses a few abbreviation possibilities in the part of the code that works around a word-abbreviation bug in zic 2017b and earlier. The attached further patch fixes that, to make the output a tiny bit smaller.
"Paul" == Paul Eggert <eggert@cs.ucla.edu> writes:
Paul> It's supposed to be portable Awk code, as per the POSIX standard Paul> and without using Gawk extensions; Thank you. I was only suspicious, but after reviewing your support material, I can see you're well in charge of this. I apologize if I've wasted anyone's time. -- Randal L. Schwartz - Stonehenge Consulting Services, Inc. - +1 503 777 0095 <merlyn@stonehenge.com> <URL:http://www.stonehenge.com/merlyn/> Perl/Unix consulting, Technical writing, Comedy, etc. etc. Still trying to think of something clever for the fourth line of this .sig
On 05/24/2017 05:49 PM, Paul Eggert wrote:
the proposed patch misses a few abbreviation possibilities
It also had some bugs and infelicities caught by further testing, and fixed in the attached patches. At this point it's good enough to be put into the main experimental version on GitHub, so you can pick up the fully-patched version there <https://github.com/eggert/tz>. The new file tzdata.zi is currently 123,621 bytes of text data, which shrinks to 22,254 bytes when compressed via 'lzip -9'. This is a significant saving over the 669,396 (compressing to 227,548) bytes in the traditional data source files, or the 460,823 (compressing to 196,210) bytes in the installed binary data files.
participants (2)
-
merlyn@stonehenge.com -
Paul Eggert