From f73e9da343cd48a7ab094a1a5b2ab19ae8fb08d9 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Sun, 13 May 2018 14:15:46 -0700 Subject: [PROPOSED 3/4] Stabilize rule name abbreviations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Problem reported by Tom Lane in: https://mm.icann.org/pipermail/tz/2018-May/026469.html Instead of Lane’s simple proposal, use a more-complex hash that shortens the overall output of zishrink.awk and generates output that is easier for humans to remember. * NEWS: Mention this. * zishrink.awk (record_hash, prehash_rule_names): New functions. (gen_rule_name): New arg NAME. All uses changed. Use a simple mnemonic: the first two letters. Check for collisions by calling record_hash. (BEGIN): Initialize hash table. --- NEWS | 3 ++ zishrink.awk | 159 +++++++++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 142 insertions(+), 20 deletions(-) diff --git a/NEWS b/NEWS index 56adc24..a91f00a 100644 --- a/NEWS +++ b/NEWS @@ -32,6 +32,9 @@ Unreleased, experimental changes if you want to build the rearguard tarball. (Problem reported by Deborah Goldsmith.) + tzdata.zi is now more stable from release to release. (Problem + noted by Tom Lane.) It is also a bit shorter. + Release 2018e - 2018-05-01 23:42:51 -0700 diff --git a/zishrink.awk b/zishrink.awk index d617644..21c71c0 100644 --- a/zishrink.awk +++ b/zishrink.awk @@ -6,28 +6,146 @@ # 'zic' should treat this script's output as if it were identical to # this script's input. +# Record a hash N for the new name NAME, checking for collisions. -# Return a new rule name. -# N_RULE_NAMES keeps track of how many rule names have been generated. +function record_hash(n, name) +{ + if (used_hashes[n]) { + printf "# ! collision: %s %s\n", used_hashes[n], name + exit 1 + } + used_hashes[n] = name +} + +# Return a shortened rule name representing NAME, +# and record this relationship to the hash table. 
+ +function gen_rule_name(name, n) +{ + # Use a simple mnemonic: the first two letters. + n = substr(name, 1, 2) + record_hash(n, name) + # printf "# %s = %s\n", n, name + return n +} -function gen_rule_name(alphabet, base, rule_name, n, digit) +function prehash_rule_names(name) { - alphabet = "" - alphabet = alphabet "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - alphabet = alphabet "abcdefghijklmnopqrstuvwxyz" - alphabet = alphabet "!$%&'()*+,./:;<=>?@[\\]^_`{|}~" - base = length(alphabet) - rule_name = "" - n = n_rule_names++ - - do { - n -= rule_name && n <= base - digit = n % base - rule_name = substr(alphabet, digit + 1, 1) rule_name - n = (n - digit) / base - } while (n); - - return rule_name + # Rule names are not part of the tzdb API, so substitute shorter + # ones. Shortening them consistently from one release to the next + # simplifies comparison of the output. That being said, the + # 1-letter names below are not standardized in any way, and can + # change arbitrarily from one release to the next, as the main goal + # here is compression not comparison. + + # Abbreviating these rule names to one letter saved the most space + # circa 2018e. + rule["Arg"] = "A" + rule["Brazil"] = "B" + rule["Canada"] = "C" + rule["Denmark"] = "D" + rule["EU"] = "E" + rule["France"] = "F" + rule["GB-Eire"] = "G" + rule["Halifax"] = "H" + rule["Italy"] = "I" + rule["Jordan"] = "J" + rule["Egypt"] = "K" # "Kemet" in ancient Egyptian + rule["Libya"] = "L" + rule["Morocco"] = "M" + rule["Neth"] = "N" + rule["Poland"] = "O" # arbitrary + rule["Palestine"] = "P" + rule["Cuba"] = "Q" # Its start sounds like "Q". 
+ rule["Russia"] = "R" + rule["Syria"] = "S" + rule["Turkey"] = "T" + rule["Uruguay"] = "U" + rule["Vincennes"] = "V" + rule["Winn"] = "W" + rule["Mongol"] = "X" # arbitrary + rule["NT_YK"] = "Y" + rule["Zion"] = "Z" + rule["Austria"] = "a" + rule["Belgium"] = "b" + rule["C-Eur"] = "c" + rule["Algeria"] = "d" # country code DZ + rule["E-Eur"] = "e" + rule["Taiwan"] = "f" # Formosa + rule["Greece"] = "g" + rule["Hungary"] = "h" + rule["Iran"] = "i" + rule["StJohns"] = "j" + rule["Chatham"] = "k" # arbitrary + rule["Lebanon"] = "l" + rule["Mexico"] = "m" + rule["Tunisia"] = "n" # country code TN + rule["Moncton"] = "o" # arbitrary + rule["Port"] = "p" + rule["Albania"] = "q" + rule["Regina"] = "r" + rule["Spain"] = "s" + rule["Toronto"] = "t" + rule["US"] = "u" + rule["Louisville"] = "v" # ville + rule["Iceland"] = "w" # arbitrary + rule["Chile"] = "x" # arbitrary + rule["Para"] = "y" # country code PY + rule["Romania"] = "z" # arbitrary + rule["Macau"] = "_" # arbitrary + + # Use ISO 3166 alpha-2 country codes for remaining names that are countries. + # This is more systematic, and avoids collisions (e.g., Malta and Moldova). 
+ rule["Armenia"] = "AM" + rule["Aus"] = "AU" + rule["Azer"] = "AZ" + rule["Barb"] = "BB" + rule["Dhaka"] = "BD" + rule["Bulg"] = "BG" + rule["Bahamas"] = "BS" + rule["Belize"] = "BZ" + rule["Swiss"] = "CH" + rule["Cook"] = "CK" + rule["PRC"] = "CN" + rule["Cyprus"] = "CY" + rule["Czech"] = "CZ" + rule["Germany"] = "DE" + rule["DR"] = "DO" + rule["Ecuador"] = "EC" + rule["Finland"] = "FI" + rule["Fiji"] = "FJ" + rule["Falk"] = "FK" + rule["Ghana"] = "GH" + rule["Guat"] = "GT" + rule["Hond"] = "HN" + rule["Haiti"] = "HT" + rule["Eire"] = "IE" + rule["Iraq"] = "IQ" + rule["Japan"] = "JP" + rule["Kyrgyz"] = "KG" + rule["ROK"] = "KR" + rule["Latvia"] = "LV" + rule["Lux"] = "LX" + rule["Moldova"] = "MD" + rule["Malta"] = "MT" + rule["Mauritius"] = "MU" + rule["Namibia"] = "NA" + rule["Nic"] = "NI" + rule["Norway"] = "NO" + rule["Peru"] = "PE" + rule["Phil"] = "PH" + rule["Pakistan"] = "PK" + rule["Sudan"] = "SD" + rule["Salv"] = "SV" + rule["Tonga"] = "TO" + rule["Vanuatu"] = "VU" + + # Avoid collisions. + rule["Detroit"] = "Dt" # De = Denver + + for (name in rule) { + record_hash(rule[name], name) + } } # Process an input line and save it for later output. @@ -106,7 +224,7 @@ function process_input_line(line, field, end, i, n, startdef) i = field[1] == "Z" ? 4 : field[1] == "Li" ? 0 : 2 if (i && field[i] ~ /^[^-+0-9]/) { if (!rule[field[i]]) - rule[field[i]] = gen_rule_name() + rule[field[i]] = gen_rule_name(field[i]) field[i] = rule[field[i]] } @@ -146,6 +264,7 @@ function output_saved_lines(i) BEGIN { print "# version", version print "# This zic input file is in the public domain." + prehash_rule_names() } /^[\t ]*[^#\t ]/ { -- 2.7.4