From f73e9da343cd48a7ab094a1a5b2ab19ae8fb08d9 Mon Sep 17 00:00:00 2001
From: Paul Eggert <eggert@cs.ucla.edu>
Date: Sun, 13 May 2018 14:15:46 -0700
Subject: [PROPOSED 3/4] Stabilize rule name abbreviations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Problem reported by Tom Lane in:
https://mm.icann.org/pipermail/tz/2018-May/026469.html
Instead of Lane’s simple proposal, use a more-complex hash that
shortens the overall output of zishrink.awk and generates output
that is easier for humans to remember.
* NEWS: Mention this.
* zishrink.awk (record_hash, prehash_rule_names): New functions.
(gen_rule_name): New arg NAME.  All uses changed.
Use a simple mnemonic: the first two letters.
Check for collisions by calling record_hash.
(BEGIN): Initialize hash table.
---
 NEWS         |   3 ++
 zishrink.awk | 159 +++++++++++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 142 insertions(+), 20 deletions(-)

diff --git a/NEWS b/NEWS
index 56adc24..a91f00a 100644
--- a/NEWS
+++ b/NEWS
@@ -32,6 +32,9 @@ Unreleased, experimental changes
     if you want to build the rearguard tarball.  (Problem reported by
     Deborah Goldsmith.)
 
+    tzdata.zi is now more stable from release to release.  (Problem
+    noted by Tom Lane.)  It is also a bit shorter.
+
 
 Release 2018e - 2018-05-01 23:42:51 -0700
 
diff --git a/zishrink.awk b/zishrink.awk
index d617644..21c71c0 100644
--- a/zishrink.awk
+++ b/zishrink.awk
@@ -6,28 +6,146 @@
 # 'zic' should treat this script's output as if it were identical to
 # this script's input.
 
+# Record a hash N for the new name NAME, checking for collisions.
 
-# Return a new rule name.
-# N_RULE_NAMES keeps track of how many rule names have been generated.
+function record_hash(n, name)
+{
+  if (used_hashes[n]) {  # N was already recorded for some earlier name
+    printf "# ! collision: %s %s\n", used_hashes[n], name
+    exit 1  # give up with a failure status once both names are reported
+  }
+  used_hashes[n] = name  # remember which name owns abbreviation N
+}
+
+# Return a shortened rule name representing NAME,
+# and record this relationship in the hash table.
+
+function gen_rule_name(name, n)  # N is a local; callers pass only NAME
+{
+  # Use a simple mnemonic: the first two letters.
+  n = substr(name, 1, 2)
+  record_hash(n, name)  # exits if N is already owned by another name
+  # printf "# %s = %s\n", n, name
+  return n
+}
 
-function gen_rule_name(alphabet, base, rule_name, n, digit)
+function prehash_rule_names(name)  # NAME is a local loop variable
 {
-  alphabet = ""
-  alphabet = alphabet "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-  alphabet = alphabet "abcdefghijklmnopqrstuvwxyz"
-  alphabet = alphabet "!$%&'()*+,./:;<=>?@[\\]^_`{|}~"
-  base = length(alphabet)
-  rule_name = ""
-  n = n_rule_names++
-
-  do {
-    n -= rule_name && n <= base
-    digit = n % base
-    rule_name = substr(alphabet, digit + 1, 1) rule_name
-    n = (n - digit) / base
-  } while (n);
-
-  return rule_name
+  # Rule names are not part of the tzdb API, so substitute shorter
+  # ones.  Shortening them consistently from one release to the next
+  # simplifies comparison of the output.  That being said, the
+  # 1-letter names below are not standardized in any way, and can
+  # change arbitrarily from one release to the next, as the main goal
+  # here is compression not comparison.
+
+  # Abbreviating these rule names to one letter saved the most space
+  # circa 2018e.
+  rule["Arg"] = "A"
+  rule["Brazil"] = "B"
+  rule["Canada"] = "C"
+  rule["Denmark"] = "D"
+  rule["EU"] = "E"
+  rule["France"] = "F"
+  rule["GB-Eire"] = "G"
+  rule["Halifax"] = "H"
+  rule["Italy"] = "I"
+  rule["Jordan"] = "J"
+  rule["Egypt"] = "K" # "Kemet" in ancient Egyptian
+  rule["Libya"] = "L"
+  rule["Morocco"] = "M"
+  rule["Neth"] = "N"
+  rule["Poland"] = "O" # arbitrary
+  rule["Palestine"] = "P"
+  rule["Cuba"] = "Q" # Its start sounds like "Q".
+  rule["Russia"] = "R"
+  rule["Syria"] = "S"
+  rule["Turkey"] = "T"
+  rule["Uruguay"] = "U"
+  rule["Vincennes"] = "V"
+  rule["Winn"] = "W"
+  rule["Mongol"] = "X" # arbitrary
+  rule["NT_YK"] = "Y"
+  rule["Zion"] = "Z"
+  rule["Austria"] = "a"
+  rule["Belgium"] = "b"
+  rule["C-Eur"] = "c"
+  rule["Algeria"] = "d" # country code DZ
+  rule["E-Eur"] = "e"
+  rule["Taiwan"] = "f" # Formosa
+  rule["Greece"] = "g"
+  rule["Hungary"] = "h"
+  rule["Iran"] = "i"
+  rule["StJohns"] = "j"
+  rule["Chatham"] = "k" # arbitrary
+  rule["Lebanon"] = "l"
+  rule["Mexico"] = "m"
+  rule["Tunisia"] = "n" # country code TN
+  rule["Moncton"] = "o" # arbitrary
+  rule["Port"] = "p"
+  rule["Albania"] = "q"
+  rule["Regina"] = "r"
+  rule["Spain"] = "s"
+  rule["Toronto"] = "t"
+  rule["US"] = "u"
+  rule["Louisville"] = "v" # ville
+  rule["Iceland"] = "w" # arbitrary
+  rule["Chile"] = "x" # arbitrary
+  rule["Para"] = "y" # country code PY
+  rule["Romania"] = "z" # arbitrary
+  rule["Macau"] = "_" # arbitrary
+
+  # Use ISO 3166 alpha-2 country codes for remaining names that are countries.
+  # This is more systematic, and avoids collisions (e.g., Malta and Moldova).
+  rule["Armenia"] = "AM"
+  rule["Aus"] = "AU"
+  rule["Azer"] = "AZ"
+  rule["Barb"] = "BB"
+  rule["Dhaka"] = "BD"
+  rule["Bulg"] = "BG"
+  rule["Bahamas"] = "BS"
+  rule["Belize"] = "BZ"
+  rule["Swiss"] = "CH"
+  rule["Cook"] = "CK"
+  rule["PRC"] = "CN"
+  rule["Cyprus"] = "CY"
+  rule["Czech"] = "CZ"
+  rule["Germany"] = "DE"
+  rule["DR"] = "DO"
+  rule["Ecuador"] = "EC"
+  rule["Finland"] = "FI"
+  rule["Fiji"] = "FJ"
+  rule["Falk"] = "FK"
+  rule["Ghana"] = "GH"
+  rule["Guat"] = "GT"
+  rule["Hond"] = "HN"
+  rule["Haiti"] = "HT"
+  rule["Eire"] = "IE"
+  rule["Iraq"] = "IQ"
+  rule["Japan"] = "JP"
+  rule["Kyrgyz"] = "KG"
+  rule["ROK"] = "KR"
+  rule["Latvia"] = "LV"
+  rule["Lux"] = "LU"
+  rule["Moldova"] = "MD"
+  rule["Malta"] = "MT"
+  rule["Mauritius"] = "MU"
+  rule["Namibia"] = "NA"
+  rule["Nic"] = "NI"
+  rule["Norway"] = "NO"
+  rule["Peru"] = "PE"
+  rule["Phil"] = "PH"
+  rule["Pakistan"] = "PK"
+  rule["Sudan"] = "SD"
+  rule["Salv"] = "SV"
+  rule["Tonga"] = "TO"
+  rule["Vanuatu"] = "VU"
+
+  # Avoid collisions.
+  rule["Detroit"] = "Dt" # De = Denver
+
+  for (name in rule) {
+    record_hash(rule[name], name)  # reserve each preset; exits on a duplicate
+  }
 }
 
 # Process an input line and save it for later output.
@@ -106,7 +224,7 @@ function process_input_line(line, field, end, i, n, startdef)
   i = field[1] == "Z" ? 4 : field[1] == "Li" ? 0 : 2
   if (i && field[i] ~ /^[^-+0-9]/) {
     if (!rule[field[i]])
-      rule[field[i]] = gen_rule_name()
+      rule[field[i]] = gen_rule_name(field[i])
     field[i] = rule[field[i]]
   }
 
@@ -146,6 +264,7 @@ function output_saved_lines(i)
 BEGIN {
   print "# version", version
   print "# This zic input file is in the public domain."
+  prehash_rule_names()  # fill rule[] and used_hashes[] before input is read
 }
 
 /^[\t ]*[^#\t ]/ {
-- 
2.7.4