From 20a4dfca1c812492465f4cd6374c74a707305009 Mon Sep 17 00:00:00 2001
From: Eli Zaretskii <eliz@gnu.org>
Date: Sat, 3 Aug 2024 18:11:57 +0300
Subject: [PATCH] Improve font search and handling on MS-Windows

* src/w32font.c: Add commentary about font search on MS-Windows.
(w32font_coverage_ok, add_font_entity_to_list)
(font_supported_scripts): Consider the coverage OK if a font has
only the SIP bit set, but also sets relevant codepage bits in the
CSB bits.
(font_supported_scripts): Fix script for USB bit 99.
* src/font.c (font_parse_fcname, font_parse_family_registry)
[HAVE_NTGUI]: Don't consider hyphenated suffixes of some Windows
fonts as not belonging to the family name.
* src/w32uniscribe.c (uniscribe_check_otf_1): Increase tags[]
array size, to avoid the E_OUTOFMEMORY error for some fonts.

* lisp/international/fontset.el (font-encoding-alist): Add
'unicode-sip'.

(cherry picked from commit ff6954b9c833bfeb8032fb772fa08e60e9ec56a8)
---
 lisp/international/fontset.el |   1 +
 src/font.c                    |  42 ++++++++---
 src/w32font.c                 | 133 ++++++++++++++++++++++++++++++++--
 src/w32uniscribe.c            |   2 +-
 4 files changed, 160 insertions(+), 18 deletions(-)

diff --git a/lisp/international/fontset.el b/lisp/international/fontset.el
index 695c313cb26..c9b60418b22 100644
--- a/lisp/international/fontset.el
+++ b/lisp/international/fontset.el
@@ -88,6 +88,7 @@
 	("iso10646-1$" . (unicode-bmp . nil))
 	("iso10646.indian-1" . (unicode-bmp . nil))
 	("unicode-bmp" . (unicode-bmp . nil))
+        ("unicode-sip" . (unicode-sip . nil)) ; used by w32font.c
 	("abobe-symbol" . symbol)
 	("sisheng_cwnn" . chinese-sisheng)
 	("mulearabic-0" . arabic-digit)
diff --git a/src/font.c b/src/font.c
index 246fe1c4426..112618a7307 100644
--- a/src/font.c
+++ b/src/font.c
@@ -1627,15 +1627,30 @@ font_parse_fcname (char *name, ptrdiff_t len, Lisp_Object font)
 	{
 	  bool decimal = 0, size_found = 1;
 	  for (q = p + 1; *q && *q != ':'; q++)
-	    if (! c_isdigit (*q))
-	      {
-		if (*q != '.' || decimal)
-		  {
-		    size_found = 0;
-		    break;
-		  }
-		decimal = 1;
-	      }
+	    {
+#ifdef HAVE_NTGUI
+	      /* MS-Windows has several CJK fonts whose name ends in
+                 "-ExtB".  It also has fonts whose names end in "-R" or
+                 "-B", and one font whose name ends in "-SB".  */
+	      if (q == p + 1 && (strncmp (q, "ExtB", 4) == 0
+				 || strncmp (q, "R", 1) == 0
+				 || strncmp (q, "B", 1) == 0
+				 || strncmp (q, "SB", 2) == 0))
+		{
+		  size_found = 0;
+		  break;
+		}
+#endif
+	      if (! c_isdigit (*q))
+		{
+		  if (*q != '.' || decimal)
+		    {
+		      size_found = 0;
+		      break;
+		    }
+		  decimal = 1;
+		}
+	    }
 	  if (size_found)
 	    {
 	      family_end = p;
@@ -2000,6 +2015,15 @@ font_parse_family_registry (Lisp_Object family, Lisp_Object registry, Lisp_Objec
       len = SBYTES (family);
       p0 = SSDATA (family);
       p1 = strchr (p0, '-');
+#ifdef HAVE_NTGUI
+      /* MS-Windows has fonts whose family name ends in "-ExtB" and
+         other suffixes which include a hyphen.  */
+      if (p1 && (strcmp (p1, "-ExtB") == 0
+		 || strcmp (p1, "-R") == 0
+		 || strcmp (p1, "-B") == 0
+		 || strcmp (p1, "-SB") == 0))
+	p1 = NULL;
+#endif
       if (p1)
 	{
 	  if ((*p0 != '*' && p1 - p0 > 0)
diff --git a/src/w32font.c b/src/w32font.c
index ccbd3837afb..efb42d80336 100644
--- a/src/w32font.c
+++ b/src/w32font.c
@@ -809,6 +809,93 @@ w32font_otf_drive (struct font *font, Lisp_Object features,
                    bool alternate_subst);
   */
 
+/* Notes about the way fonts are found on MS-Windows when we have a
+   character unsupported by the default font.
+
+   Since we don't use Fontconfig on MS-Windows, we cannot efficiently
+   search for fonts which support certain characters, because Windows
+   doesn't store this information anywhere, and we can only know whether
+   a font supports some character if we actually open the font, which is
+   expensive and slow.  Instead, we rely on font information Windows
+   exposes to the API we use to enumerate available fonts,
+   EnumFontFamiliesEx.  This information includes two bitmapped attributes:
+
+     USB (which stands for Unicode Subset Bitfields) -- this is an array
+         of 4 32-bit values, 128 bits in total, where each bit
+         corresponds to some block (sometimes several related blocks) of
+         Unicode codepoints which the font claims to support.
+     CSB (which stands for Codepage Bitfields) -- this is an array of 2
+	 32-bit values (64 bits), where each bit corresponds to some
+	 codepage whose characters the font claims to support.
+
+   When Emacs needs to find a font for a character, it enumerates the
+   available fonts, filtering the fonts by examining these bitmaps and a
+   few other font attributes.  The script of the character is converted
+   to the corresponding bits in USB, and a font that has any of these
+   bits set is deemed as a candidate; see font_supported_scripts, which
+   is called by font_matches_spec.  The problem with this strategy is
+   twofold:
+
+    - Some Unicode blocks have no USB bits.  For the scripts
+      corresponding to those blocks we use a small cache of fonts known
+      to support those scripts.  This cache is calculated once, and need
+      not be recalculated as long as no fonts are installed or deleted
+      (it can be saved in your init file and reused for the following
+      sessions).  See the function w32-find-non-USB-fonts.  Note that
+      for that function to work well, 'script-representative-chars'
+      should include the important characters for each script which has
+      no USB bits defined.
+
+    - Some fonts claim support for a block, but don't support it well.
+      Other fonts support some blocks very well, but don't set the
+      corresponding USB bits for the blocks.  For these we use some
+      heuristics:
+
+      . For a few fonts that claim coverage, but don't provide it, we
+	either recognize them by name and reject their false claims, or
+	let users set face-ignored-fonts to ignore those fonts.
+
+      . For fonts that support some blocks very well, but don't set
+	their USB bits, we examine the CSB bits instead.  This is
+	particularly important for some CJK fonts with good support in
+	the SIP area: they only set the SIP bit (bit 57) in the USB.  We
+	consider those as candidates for CJK scripts ('han', 'kana',
+	etc.) if the CSB bits are set for the corresponding CJK
+	codepages.
+
+   Eventually, some characters could still appear as "tofu" (a box with
+   the character's hex codepoint), even though a font might be available
+   on the system which supports the character.  This is because the
+   above strategy, with all its heuristics and tricks, sometimes fails.
+   For example, it could fail if the system has several fonts installed
+   whose coverage of some blocks is incomplete -- Emacs could select
+   such a font based on its USB bits, and realize the font has no glyph
+   for a character only when it's too late.  This happens because when
+   several fonts claim coverage of the same Unicode block, Emacs on
+   Windows has no way of preferring one over the other, if they all
+   support the same values of size, weight, and slant.  So Emacs usually
+   selects the first such candidate, which could lack glyphs for the
+   characters Emacs needs to display.  Since we avoid naming non-free
+   Windows fonts in Emacs's sources, this cannot be fixed in the
+   default fontset setup provided by Emacs: we cannot arrange for the
+   "good" fonts to be used in all such cases, because that would mean
+   naming those fonts.  The solution for these issues is to customize the
+   default fontset using set-fontset-font, to force Emacs to use a font
+   known to support some characters.
+
+   One other Windows-specific issue is the fact that some Windows fonts
+   have hyphens in their names.  Emacs generally follows the XLFD
+   specifications, where a hyphen is used as separator between segments
+   of a font spec.  There are a few places in the code in font.c where
+   Emacs handles such font names specially, and it currently knows about
+   font names documented for Windows versions up to and including 11.
+   See this page for the latest update:
+
+     https://learn.microsoft.com/en-us/typography/fonts/windows_11_font_list
+
+   If more fonts are added to Windows that have hyphens in their names,
+   the code in font.c will need to be updated.  */
+
 /* Internal implementation of w32font_list.
    Additional parameter opentype_only restricts the returned fonts to
    opentype fonts, which can be used with the Uniscribe backend.  */
@@ -1455,22 +1542,34 @@ static int
 w32font_coverage_ok (FONTSIGNATURE * coverage, BYTE charset)
 {
   DWORD subrange1 = coverage->fsUsb[1];
+  DWORD codepages0 = coverage->fsCsb[0];
 
 #define SUBRANGE1_HAN_MASK 0x08000000
 #define SUBRANGE1_HANGEUL_MASK 0x01000000
 #define SUBRANGE1_JAPANESE_MASK (0x00060000 | SUBRANGE1_HAN_MASK)
+#define SUBRANGE1_SIP_MASK 0x02000000
 
+/* We consider the coverage to be OK if either (a) subrange1 has the
+   bits set that correspond to CHARSET, or (b) subrange1 indicates SIP
+   support and codepages0 has one or more bits set corresponding to
+   CHARSET.  */
   if (charset == GB2312_CHARSET || charset == CHINESEBIG5_CHARSET)
     {
-      return (subrange1 & SUBRANGE1_HAN_MASK) == SUBRANGE1_HAN_MASK;
+      return ((subrange1 & SUBRANGE1_HAN_MASK) == SUBRANGE1_HAN_MASK
+	      || ((subrange1 & SUBRANGE1_SIP_MASK) != 0
+		  && (codepages0 & CSB_CHINESE) != 0));
     }
   else if (charset == SHIFTJIS_CHARSET)
     {
-      return (subrange1 & SUBRANGE1_JAPANESE_MASK) == SUBRANGE1_JAPANESE_MASK;
+      return ((subrange1 & SUBRANGE1_JAPANESE_MASK) == SUBRANGE1_JAPANESE_MASK
+	      || ((subrange1 & SUBRANGE1_SIP_MASK) != 0
+		  && (codepages0 & CSB_JAPANESE) != 0));
     }
   else if (charset == HANGEUL_CHARSET)
     {
-      return (subrange1 & SUBRANGE1_HANGEUL_MASK) == SUBRANGE1_HANGEUL_MASK;
+      return ((subrange1 & SUBRANGE1_HANGEUL_MASK) == SUBRANGE1_HANGEUL_MASK
+	      || ((subrange1 & SUBRANGE1_SIP_MASK) != 0
+		  && (codepages0 & CSB_KOREAN) != 0));
     }
 
   return 1;
@@ -1620,11 +1719,18 @@ add_font_entity_to_list (ENUMLOGFONTEX *logical_font,
 	}
       /* unicode-sip fonts must contain characters in Unicode plane 2.
 	 so look for bit 57 (surrogates) in the Unicode subranges, plus
-	 the bits for CJK ranges that include those characters.  */
+	 the bits for CJK ranges that include those characters or CJK
+	 bits in code-page bit fields.  */
       else if (EQ (spec_charset, Qunicode_sip))
 	{
-	  if (!(physical_font->ntmFontSig.fsUsb[1] & 0x02000000)
-	      || !(physical_font->ntmFontSig.fsUsb[1] & 0x28000000))
+	  if (!((physical_font->ntmFontSig.fsUsb[1] & 0x02000000)
+		&& ((physical_font->ntmFontSig.fsUsb[1] & 0x28000000)
+		    /* Some CJK fonts with very good coverage of SIP
+                       characters have only the 0x02000000 bit in USB
+                       set, so we allow them if their code-page bits
+                       indicate support for CJK character sets.  */
+		    || (physical_font->ntmFontSig.fsCsb[0]
+			& (CSB_CHINESE | CSB_JAPANESE | CSB_KOREAN)))))
 	    return 1;
 	}
 
@@ -2328,7 +2434,18 @@ font_supported_scripts (FONTSIGNATURE * sig)
   SUBRANGE (53, Qphags_pa);
   /* 54: Enclosed CJK letters and months, 55: CJK Compatibility.  */
   SUBRANGE (56, Qhangul);
-  /* 57: Surrogates.  */
+  /* 57: Non-BMP.  Processed specially: Several fonts that support CJK
+     Ideographs Extensions and other extensions set just this bit and
+     Latin, and nothing else.  */
+  if (subranges[57 / 32] & (1U << (57 % 32)))
+    {
+      if ((sig->fsCsb[0] & CSB_CHINESE))
+	supported = Fcons (Qhan, supported);
+      if ((sig->fsCsb[0] & CSB_JAPANESE))
+	supported = Fcons (Qkana, supported);
+      if ((sig->fsCsb[0] & CSB_KOREAN))
+	supported = Fcons (Qhangul, supported);
+    }
   SUBRANGE (58, Qphoenician);
   SUBRANGE (59, Qhan); /* There are others, but this is the main one.  */
   SUBRANGE (59, Qideographic_description); /* Windows lumps this in.  */
@@ -2385,7 +2502,7 @@ font_supported_scripts (FONTSIGNATURE * sig)
   SUBRANGE (97, Qglagolitic);
   SUBRANGE (98, Qtifinagh);
   /* 99: Yijing Hexagrams.  */
-  SUBRANGE (99, Qhan);
+  SUBRANGE (99, Qcjk_misc);
   SUBRANGE (100, Qsyloti_nagri);
   SUBRANGE (101, Qlinear_b);
   SUBRANGE (101, Qaegean_number);
diff --git a/src/w32uniscribe.c b/src/w32uniscribe.c
index 471bdf544d8..751963705d2 100644
--- a/src/w32uniscribe.c
+++ b/src/w32uniscribe.c
@@ -895,7 +895,7 @@ uniscribe_check_otf_1 (HDC context, Lisp_Object script, Lisp_Object lang,
 		       Lisp_Object features[2], int *retval)
 {
   SCRIPT_CACHE cache = NULL;
-  OPENTYPE_TAG tags[32], script_tag, lang_tag;
+  OPENTYPE_TAG tags[128], script_tag, lang_tag;
   int max_tags = ARRAYELTS (tags);
   int ntags, i, ret = 0;
   HRESULT rslt;
-- 
2.39.5