From: Eli Zaretskii <eliz@gnu.org>
Date: Sat, 16 Mar 2019 11:59:03 +0000 (+0200)
Subject: Improve locale and language environment setting at startup
X-Git-Tag: emacs-27.0.90~3405
X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=34dd4e0a83e19882f61c9a2ac99ecc12632d13d4;p=emacs.git

Improve locale and language environment setting at startup

* lisp/international/mule-cmds.el (locale-language-names): Add
more locales and their language environments.
(set-locale-environment): Use w32-multibyte-code-page, if
non-zero, as locale-coding-system.  (Bug#34684)

* src/w32fns.c (globals_of_w32fns) <w32-multibyte-code-page>:
New variable.

* etc/NEWS: Mention w32-multibyte-code-page.
---

diff --git a/etc/NEWS b/etc/NEWS
index 000d211c1ab..f25c3f5dc3d 100644
--- a/etc/NEWS
+++ b/etc/NEWS
@@ -1736,6 +1736,14 @@ versions of MS-Windows.  Set this variable to 50 if for some reason
 you need the old behavior (and please report such situations to Emacs
 developers).
 
+---
+** New variable 'w32-multibyte-code-page'.
+This variable holds the value of the multibyte code page used by the
+system.  It is usually zero, which indicates that 'w32-ansi-code-page'
+is being used, except in Far Eastern locales.  When this variable is
+non-zero, Emacs at startup sets 'locale-coding-system' to the
+corresponding encoding, instead of using 'w32-ansi-code-page'.
+
 +++
 ** On NS the behaviour of drag and drop can now be modified by use of
 modifier keys in line with Apples guidelines.  This makes the drag and
diff --git a/lisp/international/mule-cmds.el b/lisp/international/mule-cmds.el
index 3c1769a02fb..5f87d899415 100644
--- a/lisp/international/mule-cmds.el
+++ b/lisp/international/mule-cmds.el
@@ -2181,22 +2181,27 @@ See `set-language-info-alist' for use in programs."
 (defconst locale-language-names
   (purecopy
    '(
-    ;; Locale names of the form LANGUAGE[_TERRITORY][.CODESET][@MODIFIER]
-    ;; as specified in the Single Unix Spec, Version 2.
-    ;; LANGUAGE is a language code taken from ISO 639:1988 (E/F)
-    ;; with additions from ISO 639/RA Newsletter No.1/1989;
-    ;; see Internet RFC 2165 (1997-06) and
-    ;; http://www.evertype.com/standards/iso639/iso639-en.html
-    ;; TERRITORY is a country code taken from ISO 3166
-    ;; http://www.din.de/gremien/nas/nabd/iso3166ma/codlstp1/en_listp1.html.
-    ;; CODESET and MODIFIER are implementation-dependent.
+     ;; Locale names of the form LANGUAGE[_TERRITORY][.CODESET][@MODIFIER]
+     ;; as specified in the Single Unix Spec, Version 2.
+     ;; LANGUAGE is a language code taken from ISO 639:1988 (E/F)
+     ;; with additions from ISO 639/RA Newsletter No.1/1989;
+     ;; see Internet RFC 2165 (1997-06) and
+     ;; http://www.evertype.com/standards/iso639/iso639-en.html
+     ;; TERRITORY is a country code taken from ISO 3166
+     ;; http://www.din.de/gremien/nas/nabd/iso3166ma/codlstp1/en_listp1.html.
+     ;; CODESET and MODIFIER are implementation-dependent.
+
+     ;; Language names for which there are no locales (yet) are
+     ;; commented out.
 
      ;; jasonr comments: MS Windows uses three letter codes for
      ;; languages instead of the two letter ISO codes that POSIX
-     ;; uses. In most cases the first two letters are the same, so
-     ;; most of the regexps in locale-language-names work. Japanese
-     ;; and Chinese are exceptions, which are listed in the
-     ;; non-standard section at the bottom of locale-language-names.
+     ;; uses.  In most cases the first two letters are the same, so
+     ;; most of the regexps in locale-language-names work.  Japanese,
+     ;; Chinese, and some others are exceptions, which are listed in the
+     ;; non-standard section at the bottom of locale-language-names, or
+     ;; in the main section, if otherwise we would pick up the wrong
+     ;; entry (because the first matching entry is used).
 
     ("aa_DJ" . "Latin-1") ; Afar
     ("aa" . "UTF-8")
@@ -2204,11 +2209,12 @@ See `set-language-info-alist' for use in programs."
     ("af" . "Latin-1") ; Afrikaans
     ("am" "Ethiopic" utf-8) ; Amharic
     ("an" . "Latin-9") ; Aragonese
+    ("arn" . "UTF-8") ; MS-Windows Mapudungun, Mapuche
     ("ar" . "Arabic")
-    ; as Assamese
+    ("as" . "UTF-8") ; Assamese
     ; ay Aymara
     ("az" . "UTF-8") ; Azerbaijani
-    ; ba Bashkir
+    ("ba" . "UTF-8") ; Bashkir, Cyrillic script
     ("be" "Belarusian" cp1251) ; Belarusian [Byelorussian until early 1990s]
     ("bg" "Bulgarian" cp1251) ; Bulgarian
     ; bh Bihari
@@ -2219,12 +2225,12 @@ See `set-language-info-alist' for use in programs."
     ("bs" . "Latin-2") ; Bosnian
     ("byn" . "UTF-8")  ; Bilin; Blin
     ("ca" "Catalan" iso-8859-1) ; Catalan
-    ; co Corsican
+    ("co" . "UTF-8") ; Corsican
     ("cs" "Czech" iso-8859-2)
     ("cy" "Welsh" iso-8859-14)
     ("da" . "Latin-1") ; Danish
     ("de" "German" iso-8859-1)
-    ; dv Divehi
+    ("dv" . "UTF-8") ; Divehi
     ; dz Bhutani
     ("ee" . "Latin-4") ; Ewe
     ("el" "Greek" iso-8859-7)
@@ -2238,6 +2244,8 @@ See `set-language-info-alist' for use in programs."
     ("et" . "Latin-9") ; Estonian
     ("eu" . "Latin-1") ; Basque
     ("fa" "Persian" utf-8) ; Persian
+    ("fil" . "UTF-8") ; Filipino
+    ("fpo" . "UTF-8") ; MS-Windows Filipino
     ("fi" . "Latin-9") ; Finnish
     ("fj" . "Latin-1") ; Fiji
     ("fo" . "Latin-1") ; Faroese
@@ -2246,6 +2254,7 @@ See `set-language-info-alist' for use in programs."
     ("ga" . "Latin-1") ; Irish Gaelic (new orthography)
     ("gd" . "Latin-9") ; Scots Gaelic
     ("gez" "Ethiopic" utf-8) ; Geez
+    ("gla" . "Latin-9") ; MS-Windows Scots Gaelic
     ("gl" . "Latin-1") ; Gallegan; Galician
     ; gn Guarani
     ("gu" "Gujarati" utf-8) ; Gujarati
@@ -2256,27 +2265,33 @@ See `set-language-info-alist' for use in programs."
     ("hni_IN" . "UTF-8") ; Chhattisgarhi
     ("hr" "Croatian" iso-8859-2) ; Croatian
     ("hu" . "Latin-2") ; Hungarian
-    ; hy Armenian
+    ("hy" . "UTF-8") ;  Armenian
     ; ia Interlingua
     ("id" . "Latin-1") ; Indonesian
     ; ie Interlingue
-    ; ik Inupiak
+    ("ig" . "UTF-8") ; Igbo (Nigeria)
+    ("ibo" . "UTF-8") ; MS-Windows Igbo
+    ; ik Inupiak, Inupiaq
     ("is" . "Latin-1") ; Icelandic
     ("it" "Italian" iso-8859-1) ; Italian
     ; iu Inuktitut
     ("iw" "Hebrew" iso-8859-8)
     ("ja" "Japanese" euc-jp)
     ; jw Javanese
+    ("kal" . "Latin-1") ; MS-Windows Greenlandic
     ("ka" "Georgian" georgian-ps) ; Georgian
-    ; kk Kazakh
+    ("kk" . "UTF-8") ; Kazakh
     ("kl" . "Latin-1") ; Greenlandic
     ("km" "Khmer" utf-8) ; Cambodian, Khmer
+    ("knk" "Devanagari" utf-8) ; MS-Windows Konkani
+    ("kok" "Devanagari" utf-8) ; Konkani
     ("kn" "Kannada" utf-8)
     ("ko" "Korean" euc-kr)
     ("ks" . "UTF-8") ; Kashmiri
     ; ku Kurdish
     ("kw" . "Latin-1") ; Cornish
     ("ky" . "UTF-8") ; Kirghiz
+    ("lao" "Lao" utf-8) ; MS-Windows Lao
     ("la" . "Latin-1") ; Latin
     ("lb" . "Latin-1") ; Luxemburgish
     ("lg" . "Latin-6") ; Ganda, a.k.a. Luganda
@@ -2287,18 +2302,22 @@ See `set-language-info-alist' for use in programs."
     ; mg Malagasy
     ("mi" . "Latin-7") ; Maori
     ("mk" "Cyrillic-ISO" iso-8859-5) ; Macedonian
+    ("mlt" . "Latin-3") ; MS-Windows Maltese
     ("ml" "Malayalam" utf-8)
     ("mn" . "UTF-8") ; Mongolian
-    ; mo Moldavian
+    ; mo Moldavian (retired)
+    ("mri" . "Latin-7") ; MS-Windows Maori
     ("mr" "Devanagari" utf-8) ; Marathi
     ("ms" . "Latin-1") ; Malay
     ("mt" . "Latin-3") ; Maltese
+    ("mym" "Malayalam" utf-8) ; MS-Windows Malayalam
     ("my" "Burmese" utf-8) ; Burmese
     ; na Nauru
     ("nb" . "Latin-1") ; Norwegian
     ("ne" "Devanagari" utf-8) ; Nepali
     ("nl" "Dutch" iso-8859-1)
     ("nn" . "Latin-1") ; Norwegian Nynorsk
+    ("non" . "Latin-1") ; MS-Windows Norwegian Nynorsk
     ("no" . "Latin-1") ; Norwegian
     ("nr_ZA" . "UTF-8") ; South Ndebele
     ("nso_ZA" . "UTF-8") ; Pedi
@@ -2308,7 +2327,8 @@ See `set-language-info-alist' for use in programs."
     ("or" "Oriya" utf-8)
+    ("pas" . "UTF-8") ; MS-Windows Pashto
     ("pa" "Punjabi" utf-8) ; Punjabi
     ("pl" "Polish" iso-8859-2) ; Polish
-    ; ps Pashto, Pushto
+    ("ps" . "UTF-8") ; Pashto, Pushto
     ("pt_BR" "Brazilian Portuguese" iso-8859-1) ; Brazilian Portuguese
     ("pt" . "Latin-1") ; Portuguese
     ; qu Quechua
@@ -2318,7 +2338,7 @@ See `set-language-info-alist' for use in programs."
     ("ru_RU.koi8r" "Cyrillic-KOI8" koi8-r)
     ("ru_RU" "Russian" iso-8859-5)
     ("ru_UA" "Russian" koi8-u)
-    ; rw Kinyarwanda
+    ("rw" . "UTF-8") ; Kinyarwanda
     ("sa" . "Devanagari") ; Sanskrit
     ; sd Sindhi
     ("se" . "UTF-8") ; Northern Sami
@@ -2339,6 +2359,7 @@ See `set-language-info-alist' for use in programs."
     ; su Sundanese
     ("sv" "Swedish" iso-8859-1)		; Swedish
     ("sw" . "Latin-1") ; Swahili
+    ("taj" "Tajik" koi8-t) ; MS-Windows Tajik w/Cyrillic script
     ("ta" "Tamil" utf-8)
     ("te" "Telugu" utf-8) ; Telugu
     ("tg" "Tajik" koi8-t)
@@ -2348,15 +2369,17 @@ See `set-language-info-alist' for use in programs."
     ("th" "Thai" iso-8859-11)
     ("ti" "Ethiopic" utf-8) ; Tigrinya
     ("tig_ER" . "UTF-8") ; Tigre
-    ; tk Turkmen
+    ("tk" . "Latin-5") ; Turkmen
+    ("tuk" . "Latin-5") ; MS-Windows Turkmen
     ("tl" . "Latin-1") ; Tagalog
     ("tn" . "Latin-9") ; Setswana, Tswana
     ; to Tonga
     ("tr" "Turkish" iso-8859-9)
+    ("tsn" . "Latin-9") ; MS-Windows Tswana
     ("ts" . "Latin-1") ; Tsonga
     ("tt" . "UTF-8") ; Tatar
     ; tw Twi
-    ; ug Uighur
+    ("ug" . "UTF-8") ; Uighur
     ("uk" "Ukrainian" koi8-u)
     ("ur" . "UTF-8") ; Urdu
     ("uz_UZ@cyrillic" . "UTF-8"); Uzbek
@@ -2365,10 +2388,10 @@ See `set-language-info-alist' for use in programs."
     ("vi" "Vietnamese" utf-8)
     ; vo Volapuk
     ("wa" . "Latin-1") ; Walloon
-    ; wo Wolof
+    ("wo" . "UTF-8") ; Wolof
     ("xh" . "Latin-1") ; Xhosa
     ("yi" . "Windows-1255") ; Yiddish
-    ; yo Yoruba
+    ("yo" . "UTF-8") ; Yoruba
     ; za Zhuang
     ("zh_HK" . "Chinese-Big5")
     ; zh_HK/BIG5-HKSCS \
@@ -2378,6 +2401,9 @@ See `set-language-info-alist' for use in programs."
     ("zh_CN.GB18030" "Chinese-GB18030")
     ("zh_CN.UTF-8" . "Chinese-GBK")
     ("zh_CN" . "Chinese-GB")
+    ("zhh" . "Chinese-Big5") ; MS-Windows Chinese (Hong Kong S.A.R.)
+    ("zhi" . "Chinese-GBK") ; MS-Windows Chinese (Singapore)
+    ("zhm" . "Chinese-Big5") ; MS-Windows Chinese (Macao S.A.R.)
     ("zh" . "Chinese-GB")
     ("zu" . "Latin-1") ; Zulu
 
@@ -2395,12 +2421,23 @@ See `set-language-info-alist' for use in programs."
     ("sp" . "Cyrillic-ISO") ; Serbian (Cyrillic alphabet), e.g. X11R6.4
     ("su" . "Latin-1") ; Finnish, e.g. Solaris 2.6
     ("jp" . "Japanese") ; e.g. MS Windows
-    ("chs" . "Chinese-GBK") ; MS Windows Chinese Simplified
-    ("cht" . "Chinese-BIG5") ; MS Windows Chinese Traditional
+    ("chs" . "Chinese-GBK") ; MS Windows Chinese Simplified (PRC)
+    ("cht" . "Chinese-BIG5") ; MS Windows Chinese Traditional (Taiwan)
     ("gbz" . "UTF-8") ; MS Windows Dari Persian
     ("div" . "UTF-8") ; MS Windows Divehi (Maldives)
     ("wee" . "Latin-2") ; MS Windows Lower Sorbian
     ("wen" . "Latin-2") ; MS Windows Upper Sorbian
+    ("ind" . "Latin-1") ; MS-Windows Indonesian
+    ("sme" . "UTF-8") ; MS-Windows Northern Sami (Norway)
+    ("smf" . "UTF-8") ; MS-Windows Northern Sami (Sweden)
+    ("smg" . "UTF-8") ; MS-Windows Northern Sami (Finland)
+    ("kdi" "Kannada" utf-8) ; MS-Windows Kannada
+    ("mar" "Devanagari" utf-8) ; MS-Windows Marathi
+    ("khm" "Khmer" utf-8) ; MS-Windows Khmer
+    ("iri" . "Latin-1") ; MS-Windows Irish Gaelic
+    ; mwk  MS-Windows Mohawk (Canada)
+    ("uig" . "UTF-8") ; MS-Windows Uighur
+    ("kin" . "UTF-8") ;  MS-Windows Kinyarwanda
     ))
   "Alist of locale regexps vs the corresponding languages and coding systems.
 Each element has this form:
@@ -2702,10 +2739,20 @@ See also `locale-charset-language-names', `locale-language-names',
              (output-coding
               (if noninteractive
                   (intern (format "cp%d" (w32-get-console-output-codepage)))
-                code-page-coding)))
-	(when (coding-system-p code-page-coding)
+                code-page-coding))
+             (multibyte-code-page-coding
+              (or (and (boundp 'w32-multibyte-code-page)
+                       (not (zerop w32-multibyte-code-page))
+                       (intern (format "cp%d" w32-multibyte-code-page)))
+                  code-page-coding))
+             (locale-coding
+              (if noninteractive
+                  code-page-coding
+                multibyte-code-page-coding)))
+	(when (and (coding-system-p code-page-coding)
+                   (coding-system-p locale-coding))
           (or output-coding (setq output-coding code-page-coding))
-	  (unless frame (setq locale-coding-system code-page-coding))
+	  (unless frame (setq locale-coding-system locale-coding))
 	  (set-keyboard-coding-system code-page-coding frame)
 	  (set-terminal-coding-system output-coding frame)
 	  (setq default-file-name-coding-system ansi-code-page-coding))))
diff --git a/src/w32fns.c b/src/w32fns.c
index 4f53d93d8b4..1fbf32760dd 100644
--- a/src/w32fns.c
+++ b/src/w32fns.c
@@ -48,6 +48,7 @@ along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.  */
 
 #ifdef WINDOWSNT
 #include <mbstring.h>
+#include <mbctype.h>	/* for _getmbcp */
 #endif /* WINDOWSNT */
 
 #if CYGWIN
@@ -10908,6 +10909,15 @@ globals_of_w32fns (void)
 	      doc: /* The ANSI code page used by the system.  */);
   w32_ansi_code_page = GetACP ();
 
+#ifndef CYGWIN
+  DEFVAR_INT ("w32-multibyte-code-page",
+	      w32_multibyte_code_page,
+	      doc: /* The current multibyte code page used by the system.
+A value of zero indicates that the single-byte code page is in use,
+see `w32-ansi-code-page'.  */);
+  w32_multibyte_code_page = _getmbcp ();
+#endif
+
   if (os_subtype == OS_NT)
     w32_unicode_gui = 1;
   else