From e0470bcec76b94f93aed796abdcab14e2086fffc Mon Sep 17 00:00:00 2001
From: Andrew G Cohen <cohen@andy.bu.edu>
Date: Thu, 10 Mar 2022 09:30:00 +0800
Subject: [PATCH] Replace list and vector sorting with TIMSORT algorithm

* src/Makefile.in (base_obj): Add sort.o.
* src/deps.mk (fns.o): Add sort.c.
* src/lisp.h: Add prototypes for inorder, tim_sort.
* src/sort.c: New file providing tim_sort.
* src/fns.c:  Remove prototypes for removed routines.
(merge_vectors, sort_vector_inplace, sort_vector_copy): Remove.
(sort_list, sort_vector): Use tim_sort.
---
 src/Makefile.in |   2 +-
 src/deps.mk     |   2 +-
 src/fns.c       | 122 ++----
 src/lisp.h      |   4 +
 src/sort.c      | 961 ++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 993 insertions(+), 98 deletions(-)
 create mode 100644 src/sort.c

diff --git a/src/Makefile.in b/src/Makefile.in
index 3353fb16d79..e0f18dc352e 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -427,7 +427,7 @@ base_obj = dispnew.o frame.o scroll.o xdisp.o menu.o $(XMENU_OBJ) window.o \
 	minibuf.o fileio.o dired.o \
 	cmds.o casetab.o casefiddle.o indent.o search.o regex-emacs.o undo.o \
 	alloc.o pdumper.o data.o doc.o editfns.o callint.o \
-	eval.o floatfns.o fns.o font.o print.o lread.o $(MODULES_OBJ) \
+	eval.o floatfns.o fns.o sort.o font.o print.o lread.o $(MODULES_OBJ) \
 	syntax.o $(UNEXEC_OBJ) bytecode.o comp.o $(DYNLIB_OBJ) \
 	process.o gnutls.o callproc.o \
 	region-cache.o sound.o timefns.o atimer.o \
diff --git a/src/deps.mk b/src/deps.mk
index deffab93eca..39edd5c1dd3 100644
--- a/src/deps.mk
+++ b/src/deps.mk
@@ -279,7 +279,7 @@ eval.o: eval.c commands.h keyboard.h blockinput.h atimer.h systime.h frame.h \
    dispextern.h lisp.h globals.h $(config_h) coding.h composite.h xterm.h \
    msdos.h
 floatfns.o: floatfns.c syssignal.h lisp.h globals.h $(config_h)
-fns.o: fns.c commands.h lisp.h $(config_h) frame.h buffer.h character.h \
+fns.o: fns.c sort.c commands.h lisp.h $(config_h) frame.h buffer.h character.h \
    keyboard.h keymap.h window.h $(INTERVALS_H) coding.h ../lib/md5.h \
    ../lib/sha1.h ../lib/sha256.h ../lib/sha512.h blockinput.h atimer.h \
    systime.h xterm.h ../lib/unistd.h globals.h
diff --git a/src/fns.c b/src/fns.c
index 06a64563806..2e8454532cb 100644
--- a/src/fns.c
+++ b/src/fns.c
@@ -39,9 +39,6 @@ along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.  */
 #include "puresize.h"
 #include "gnutls.h"
 
-static void sort_vector_copy (Lisp_Object pred, ptrdiff_t len,
-			      Lisp_Object src[restrict VLA_ELEMS (len)],
-			      Lisp_Object dest[restrict VLA_ELEMS (len)]);
 enum equal_kind { EQUAL_NO_QUIT, EQUAL_PLAIN, EQUAL_INCLUDING_PROPERTIES };
 static bool internal_equal (Lisp_Object, Lisp_Object,
 			    enum equal_kind, int, Lisp_Object);
@@ -2166,8 +2163,10 @@ See also the function `nreverse', which is used more often.  */)
   return new;
 }
 
-/* Sort LIST using PREDICATE, preserving original order of elements
-   considered as equal.  */
+
+/* Stably sort LIST using PREDICATE. This converts the list to a
+   vector, sorts the vector using the TIMSORT algorithm, and converts
+   back to a list.  */
 
 static Lisp_Object
 sort_list (Lisp_Object list, Lisp_Object predicate)
@@ -2175,97 +2174,34 @@ sort_list (Lisp_Object list, Lisp_Object predicate)
   ptrdiff_t length = list_length (list);
   if (length < 2)
     return list;
-
-  Lisp_Object tem = Fnthcdr (make_fixnum (length / 2 - 1), list);
-  Lisp_Object back = Fcdr (tem);
-  Fsetcdr (tem, Qnil);
-
-  return merge (Fsort (list, predicate), Fsort (back, predicate), predicate);
-}
-
-/* Using PRED to compare, return whether A and B are in order.
-   Compare stably when A appeared before B in the input.  */
-static bool
-inorder (Lisp_Object pred, Lisp_Object a, Lisp_Object b)
-{
-  return NILP (call2 (pred, b, a));
-}
-
-/* Using PRED to compare, merge from ALEN-length A and BLEN-length B
-   into DEST.  Argument arrays must be nonempty and must not overlap,
-   except that B might be the last part of DEST.  */
-static void
-merge_vectors (Lisp_Object pred,
-	       ptrdiff_t alen, Lisp_Object const a[restrict VLA_ELEMS (alen)],
-	       ptrdiff_t blen, Lisp_Object const b[VLA_ELEMS (blen)],
-	       Lisp_Object dest[VLA_ELEMS (alen + blen)])
-{
-  eassume (0 < alen && 0 < blen);
-  Lisp_Object const *alim = a + alen;
-  Lisp_Object const *blim = b + blen;
-
-  while (true)
+  else
     {
-      if (inorder (pred, a[0], b[0]))
+      Lisp_Object *result;
+      USE_SAFE_ALLOCA;
+      SAFE_ALLOCA_LISP (result, length);
+      Lisp_Object tail = list;
+      for (ptrdiff_t i = 0; i < length; i++)
 	{
-	  *dest++ = *a++;
-	  if (a == alim)
-	    {
-	      if (dest != b)
-		memcpy (dest, b, (blim - b) * sizeof *dest);
-	      return;
-	    }
+	  result[i] = Fcar (tail);
+	  tail = XCDR (tail);
 	}
-      else
+      tim_sort (predicate, result, length);
+
+      ptrdiff_t i = 0;
+      tail = list;
+      while (CONSP (tail))
 	{
-	  *dest++ = *b++;
-	  if (b == blim)
-	    {
-	      memcpy (dest, a, (alim - a) * sizeof *dest);
-	      return;
-	    }
+	  XSETCAR (tail, result[i]);
+	  tail = XCDR (tail);
+	  i++;
 	}
+      SAFE_FREE ();
+      return list;
     }
 }
 
-/* Using PRED to compare, sort LEN-length VEC in place, using TMP for
-   temporary storage.  LEN must be at least 2.  */
-static void
-sort_vector_inplace (Lisp_Object pred, ptrdiff_t len,
-		     Lisp_Object vec[restrict VLA_ELEMS (len)],
-		     Lisp_Object tmp[restrict VLA_ELEMS (len >> 1)])
-{
-  eassume (2 <= len);
-  ptrdiff_t halflen = len >> 1;
-  sort_vector_copy (pred, halflen, vec, tmp);
-  if (1 < len - halflen)
-    sort_vector_inplace (pred, len - halflen, vec + halflen, vec);
-  merge_vectors (pred, halflen, tmp, len - halflen, vec + halflen, vec);
-}
-
-/* Using PRED to compare, sort from LEN-length SRC into DST.
-   Len must be positive.  */
-static void
-sort_vector_copy (Lisp_Object pred, ptrdiff_t len,
-		  Lisp_Object src[restrict VLA_ELEMS (len)],
-		  Lisp_Object dest[restrict VLA_ELEMS (len)])
-{
-  eassume (0 < len);
-  ptrdiff_t halflen = len >> 1;
-  if (halflen < 1)
-    dest[0] = src[0];
-  else
-    {
-      if (1 < halflen)
-	sort_vector_inplace (pred, halflen, src, dest);
-      if (1 < len - halflen)
-	sort_vector_inplace (pred, len - halflen, src + halflen, dest);
-      merge_vectors (pred, halflen, src, len - halflen, src + halflen, dest);
-    }
-}
-
-/* Sort VECTOR in place using PREDICATE, preserving original order of
-   elements considered as equal.  */
+/* Stably sort VECTOR ordered by PREDICATE using the TIMSORT
+   algorithm.  */
 
 static void
 sort_vector (Lisp_Object vector, Lisp_Object predicate)
@@ -2273,14 +2209,8 @@ sort_vector (Lisp_Object vector, Lisp_Object predicate)
   ptrdiff_t len = ASIZE (vector);
   if (len < 2)
     return;
-  ptrdiff_t halflen = len >> 1;
-  Lisp_Object *tmp;
-  USE_SAFE_ALLOCA;
-  SAFE_ALLOCA_LISP (tmp, halflen);
-  for (ptrdiff_t i = 0; i < halflen; i++)
-    tmp[i] = make_fixnum (0);
-  sort_vector_inplace (predicate, len, XVECTOR (vector)->contents, tmp);
-  SAFE_FREE ();
+
+  tim_sort (predicate, XVECTOR (vector)->contents, len);
 }
 
 DEFUN ("sort", Fsort, Ssort, 2, 2, 0,
diff --git a/src/lisp.h b/src/lisp.h
index 315fb03fe63..43a4589efff 100644
--- a/src/lisp.h
+++ b/src/lisp.h
@@ -3903,6 +3903,10 @@ extern Lisp_Object string_to_multibyte (Lisp_Object);
 extern Lisp_Object string_make_unibyte (Lisp_Object);
 extern void syms_of_fns (void);
 
+/* Defined in sort.c  */
+extern void tim_sort (Lisp_Object, Lisp_Object *, const ptrdiff_t);
+extern bool inorder (Lisp_Object, Lisp_Object, Lisp_Object);
+
 /* Defined in floatfns.c.  */
 verify (FLT_RADIX == 2 || FLT_RADIX == 16);
 enum { LOG2_FLT_RADIX = FLT_RADIX == 2 ? 1 : 4 };
diff --git a/src/sort.c b/src/sort.c
new file mode 100644
index 00000000000..33f5d033190
--- /dev/null
+++ b/src/sort.c
@@ -0,0 +1,961 @@
+/* Timsort for sequences.
+
+Copyright (C) 2022 Free Software Foundation, Inc.
+
+This file is part of GNU Emacs.
+
+GNU Emacs is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or (at
+your option) any later version.
+
+GNU Emacs is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.  */
+
+/* This is a version of the cpython code implementing the TIMSORT
+   sorting algorithm described in
+   https://github.com/python/cpython/blob/main/Objects/listsort.txt.
+   This algorithm identifies and pushes naturally ordered sublists of
+   the original list, or "runs", onto a stack, and merges them
+   periodically according to a merge strategy called "powersort".
+   State is maintained during the sort in a merge_state structure,
+   which is passed around as an argument to all the subroutines.  A
+   "stretch" structure includes a pointer to the run BASE of length
+   LEN along with its POWER (a computed integer used by the powersort
+   merge strategy that depends on this run and the succeeding run.  */
+
+
+#include <config.h>
+#include "lisp.h"
+
+
+/* MAX_MERGE_PENDING is the maximum number of entries in merge_state's
+   pending-stretch stack.  For a list with n elements, this needs at most
+   floor(log2(n)) + 1 entries even if we didn't force runs to a
+   minimal length.  So the number of bits in a ptrdiff_t is plenty large
+   enough for all cases.  */
+
+#define MAX_MERGE_PENDING (sizeof (ptrdiff_t)  * 8)
+
+/* Once we get into galloping mode, we stay there as long as both runs
+   win at least GALLOP_WIN_MIN consecutive times.  */
+
+#define GALLOP_WIN_MIN 7
+
+/* A small temp array of size MERGESTATE_TEMP_SIZE is used to avoid
+   malloc when merging small lists.  */
+
+#define MERGESTATE_TEMP_SIZE 256
+
+struct stretch
+{
+  Lisp_Object *base;
+  ptrdiff_t len;
+  int power;
+};
+
+struct reloc
+{
+  Lisp_Object **src;
+  Lisp_Object **dst;
+  ptrdiff_t *size;
+  int order;
+};
+
+
+typedef struct
+{
+  Lisp_Object *listbase;
+  ptrdiff_t listlen;
+
+  /* PENDING is a stack of N pending stretches yet to be merged.
+     Stretch #i starts at address base[i] and extends for len[i]
+     elements.  */
+
+  int n;
+  struct stretch pending[MAX_MERGE_PENDING];
+
+  /* The variable MIN_GALLOP, initialized to GALLOP_WIN_MIN, controls
+     when we get *into* galloping mode.  merge_lo and merge_hi tend to
+     nudge it higher for random data, and lower for highly structured
+     data.  */
+
+  ptrdiff_t min_gallop;
+
+  /* 'A' is temporary storage, able to hold ALLOCED elements, to help
+     with merges.  If temporary storage is passed to the sorting entry
+     function, 'A' will point to it.  Otherwise 'A' initially points
+     to TEMPARRAY, and subsequently to newly allocated memory if
+     needed.  */
+
+  Lisp_Object *a;
+  ptrdiff_t alloced;
+  specpdl_ref count;
+  Lisp_Object temparray[MERGESTATE_TEMP_SIZE];
+
+  /* If an exception is thrown while merging we might have to relocate
+     some list elements from temporary storage back into the list.
+     RELOC keeps track of the information needed to do this.  */
+
+  struct reloc reloc;
+
+  /* PREDICATE is the lisp comparison predicate for the sort.  */
+
+  Lisp_Object predicate;
+} merge_state;
+
+
+/* INORDER returns true iff (PREDICATE A B) is non-nil.  */
+
+inline bool
+inorder (const Lisp_Object predicate, const Lisp_Object a, const Lisp_Object b)
+{
+  return !NILP (call2 (predicate, a, b));
+}
+
+
+/* BINARYSORT() is a stable binary insertion sort used for sorting the
+   list starting at LO and ending at HI.  On entry, LO <= START <= HI,
+   and [LO, START) is already sorted (pass START == LO if you don't
+   know!).  Even in case of error, the output slice will be some
+   permutation of the input (nothing is lost or duplicated).  */
+
+static void
+binarysort (merge_state *ms, Lisp_Object *lo, const Lisp_Object *hi,
+	    Lisp_Object *start)
+{
+  Lisp_Object pred = ms->predicate;
+
+  eassume (lo <= start && start <= hi);
+  if (lo == start)
+    ++start;
+  for (; start < hi; ++start)
+    {
+      Lisp_Object *l = lo;
+      Lisp_Object *r = start;
+      Lisp_Object pivot = *r;
+      Lisp_Object *p;
+
+      eassume (l < r);
+      do {
+	p = l + ((r - l) >> 1);
+	if (inorder (pred, pivot, *p))
+	  r = p;
+	else
+	  l = p + 1;
+      } while (l < r);
+      eassume (l == r);
+      for (p = start; p > l; --p)
+	p[0] = p[-1];
+      *l = pivot;
+    }
+}
+
+
+/*  COUNT_RUN() returns the length of the run beginning at LO, in the
+    slice [LO, HI) with LO < HI.  A "run" is the longest
+    non-decreasing sequence or the longest strictly decreasing
+    sequence, with the Boolean *DESCENDING set to 0 in the former
+    case, or to 1 in the latter.  The strictness of the definition of
+    "descending" is needed so that the caller can safely reverse a
+    descending sequence without violating stability (strict > ensures
+    there are no equal elements to get out of order).  */
+
+static ptrdiff_t
+count_run (merge_state *ms, Lisp_Object *lo, const Lisp_Object *hi, bool *descending)
+{
+  Lisp_Object pred = ms->predicate;
+
+  eassume (lo < hi);
+  *descending = 0;
+  ++lo;
+  ptrdiff_t n = 1;
+  if (lo == hi)
+    return n;
+
+  n = 2;
+  if (inorder (pred, lo[0], lo[-1]))
+    {
+      *descending = 1;
+      for (lo = lo + 1; lo < hi; ++lo, ++n)
+	{
+	  if (!inorder (pred, lo[0], lo[-1]))
+	    break;
+	}
+    }
+  else
+    {
+      for (lo = lo + 1; lo < hi; ++lo, ++n)
+	{
+	  if (inorder (pred, lo[0], lo[-1]))
+	    break;
+	}
+    }
+
+  return n;
+}
+
+
+/*  GALLOP_LEFT() locates the proper position of KEY in a sorted
+    vector: if the vector contains an element equal to KEY, return the
+    position immediately to the left of the leftmost equal element.
+    [GALLOP_RIGHT() does the same except returns the position to the
+    right of the rightmost equal element (if any).]
+
+    'A' is a sorted vector with N elements, starting at A[0].  N must be > 0.
+
+    HINT is an index at which to begin the search, 0 <= HINT < N.  The closer
+    HINT is to the final result, the faster this runs.
+
+    The return value is the int k in [0, N] such that
+
+    A[k-1] < KEY <= a[k]
+
+    pretending that *(A-1) is minus infinity and A[N] is plus infinity.  IOW,
+    KEY belongs at index k; or, IOW, the first k elements of A should precede
+    KEY, and the last N-k should follow KEY.  */
+
+static ptrdiff_t
+gallop_left (merge_state *ms, const Lisp_Object key, Lisp_Object *a,
+	     const ptrdiff_t n, const ptrdiff_t hint)
+{
+  Lisp_Object pred = ms->predicate;
+
+  eassume (key && a && n > 0 && hint >= 0 && hint < n);
+
+  a += hint;
+  ptrdiff_t lastofs = 0;
+  ptrdiff_t ofs = 1;
+  if (inorder (pred, *a, key))
+    {
+      /* When a[hint] < key, gallop right until
+	 a[hint + lastofs] < key <= a[hint + ofs].  */
+      const ptrdiff_t maxofs = n - hint; /* This is one after the end of a.  */
+      while (ofs < maxofs)
+	{
+	  if (inorder (pred, a[ofs], key))
+	    {
+	      lastofs = ofs;
+	      eassume (ofs <= (PTRDIFF_MAX - 1) / 2);
+	      ofs = (ofs << 1) + 1;
+	    }
+	  else
+	    break; /* Here key <= a[hint+ofs].  */
+	}
+      if (ofs > maxofs)
+	ofs = maxofs;
+      /* Translate back to offsets relative to &a[0].  */
+      lastofs += hint;
+      ofs += hint;
+    }
+  else
+    {
+      /* When key <= a[hint], gallop left, until
+	 a[hint - ofs] < key <= a[hint - lastofs].  */
+      const ptrdiff_t maxofs = hint + 1;             /* Here &a[0] is lowest.  */
+      while (ofs < maxofs)
+	{
+	  if (inorder (pred, a[-ofs], key))
+	    break;
+	  /* Here key <= a[hint - ofs].  */
+	  lastofs = ofs;
+	  eassume (ofs <= (PTRDIFF_MAX - 1) / 2);
+	  ofs = (ofs << 1) + 1;
+	}
+      if (ofs > maxofs)
+	ofs = maxofs;
+      /* Translate back to use positive offsets relative to &a[0].  */
+      ptrdiff_t k = lastofs;
+      lastofs = hint - ofs;
+      ofs = hint - k;
+    }
+  a -= hint;
+
+  eassume (-1 <= lastofs && lastofs < ofs && ofs <= n);
+  /* Now a[lastofs] < key <= a[ofs], so key belongs somewhere to the
+     right of lastofs but no farther right than ofs.  Do a binary
+     search, with invariant a[lastofs-1] < key <= a[ofs].  */
+  ++lastofs;
+  while (lastofs < ofs)
+    {
+      ptrdiff_t m = lastofs + ((ofs - lastofs) >> 1);
+
+      if (inorder (pred, a[m], key))
+	lastofs = m + 1;              /* Here a[m] < key.  */
+      else
+	ofs = m;                    /* Here key <= a[m].  */
+    }
+  eassume (lastofs == ofs);             /* Then a[ofs-1] < key <= a[ofs].  */
+  return ofs;
+}
+
+
+/*  GALLOP_RIGHT() is exactly like GALLOP_LEFT(), except that if KEY
+    already exists in A[0:N], it finds the position immediately to the
+    right of the rightmost equal value.
+
+    The return value is the int k in [0, N] such that
+
+    A[k-1] <= KEY < A[k].  */
+
+static ptrdiff_t
+gallop_right (merge_state *ms, const Lisp_Object key, Lisp_Object *a,
+	      const ptrdiff_t n, const ptrdiff_t hint)
+{
+  Lisp_Object pred = ms->predicate;
+
+  eassume (key && a && n > 0 && hint >= 0 && hint < n);
+
+  a += hint;
+  ptrdiff_t lastofs = 0;
+  ptrdiff_t ofs = 1;
+  if (inorder (pred, key, *a))
+    {
+      /* When key < a[hint], gallop left until
+	 a[hint - ofs] <= key < a[hint - lastofs].  */
+      const ptrdiff_t maxofs = hint + 1;             /* Here &a[0] is lowest.  */
+      while (ofs < maxofs)
+	{
+	  if (inorder (pred, key, a[-ofs]))
+	    {
+	      lastofs = ofs;
+	      eassume (ofs <= (PTRDIFF_MAX - 1) / 2);
+	      ofs = (ofs << 1) + 1;
+	    }
+	  else                /* Here a[hint - ofs] <= key.  */
+	    break;
+	}
+      if (ofs > maxofs)
+	ofs = maxofs;
+      /* Translate back to use positive offsets relative to &a[0].  */
+      ptrdiff_t k = lastofs;
+      lastofs = hint - ofs;
+      ofs = hint - k;
+    }
+  else
+    {
+      /* When a[hint] <= key, gallop right, until
+	 a[hint + lastofs] <= key < a[hint + ofs].  */
+      const ptrdiff_t maxofs = n - hint;             /* Here &a[n-1] is highest.  */
+      while (ofs < maxofs)
+	{
+	  if (inorder (pred, key, a[ofs]))
+	    break;
+	  /* Here a[hint + ofs] <= key.  */
+	  lastofs = ofs;
+	  eassume (ofs <= (PTRDIFF_MAX - 1) / 2);
+	  ofs = (ofs << 1) + 1;
+	}
+      if (ofs > maxofs)
+	ofs = maxofs;
+      /* Translate back to use offsets relative to &a[0].  */
+      lastofs += hint;
+      ofs += hint;
+    }
+  a -= hint;
+
+  eassume (-1 <= lastofs && lastofs < ofs && ofs <= n);
+  /* Now a[lastofs] <= key < a[ofs], so key belongs somewhere to the
+     right of lastofs but no farther right than ofs.  Do a binary
+     search, with invariant a[lastofs-1] <= key < a[ofs].  */
+  ++lastofs;
+  while (lastofs < ofs)
+    {
+      ptrdiff_t m = lastofs + ((ofs - lastofs) >> 1);
+
+      if (inorder (pred, key, a[m]))
+	ofs = m;                    /* Here key < a[m].  */
+      else
+	lastofs = m + 1;              /* Here a[m] <= key.  */
+    }
+  eassume (lastofs == ofs);             /* Now  a[ofs-1] <= key < a[ofs].  */
+  return ofs;
+}
+
+
+static void
+merge_init (merge_state *ms, const ptrdiff_t list_size, Lisp_Object *lo,
+	    const Lisp_Object predicate)
+{
+  eassume (ms != NULL);
+
+  ms->a = ms->temparray;
+  ms->alloced = MERGESTATE_TEMP_SIZE;
+
+  ms->n = 0;
+  ms->min_gallop = GALLOP_WIN_MIN;
+  ms->listlen = list_size;
+  ms->listbase = lo;
+  ms->predicate = predicate;
+  ms->reloc = (struct reloc){NULL, NULL, NULL, 0};
+}
+
+
+/* The dynamically allocated memory may hold lisp objects during
+   merging.  MERGE_MARKMEM marks them so they aren't reaped during
+   GC.  */
+
+static void
+merge_markmem (void *arg)
+{
+  merge_state *ms = arg;
+  eassume (ms != NULL);
+
+  if (ms->reloc.size != NULL && *ms->reloc.size > 0)
+    {
+      eassume (ms->reloc.src != NULL);
+      mark_objects (*ms->reloc.src, *ms->reloc.size);
+    }
+}
+
+
+/* CLEANUP_MEM frees all temp storage.  If an exception occurs while
+   merging it will first relocate any lisp elements in temp storage
+   back to the original array.  */
+
+static void
+cleanup_mem (void *arg)
+{
+  merge_state *ms = arg;
+  eassume (ms != NULL);
+
+  /* If we have an exception while merging, some of the list elements
+     might only live in temp storage; we copy everything remaining in
+     the temp storage back into the original list.  This ensures that
+     the original list has all of the original elements, although
+     their order is unpredictable.  */
+
+  if (ms->reloc.order != 0 && *ms->reloc.size > 0)
+    {
+      eassume (*ms->reloc.src != NULL && *ms->reloc.dst != NULL);
+      ptrdiff_t n = *ms->reloc.size;
+      ptrdiff_t shift = ms->reloc.order == -1 ? 0 : n - 1;
+      memcpy (*ms->reloc.dst - shift, *ms->reloc.src, n * word_size);
+    }
+
+  /* Free any remaining temp storage.  */
+  xfree (ms->a);
+}
+
+
+/* MERGE_GETMEM() ensures availability of enough temp memory for NEED
+   array slots.  Any previously allocated memory is first freed, and a
+   cleanup routine is registered to free memory at the very end, or on
+   exception.  */
+
+static void
+merge_getmem (merge_state *ms, const ptrdiff_t need)
+{
+  eassume (ms != NULL);
+
+  if (ms->a == ms->temparray)
+    {
+      /* We only get here if alloc is needed and this is the first
+	 time, so we set up the unwind.  */
+      specpdl_ref count = SPECPDL_INDEX ();
+      record_unwind_protect_ptr_mark (cleanup_mem, ms, merge_markmem);
+      ms->count = count;
+    }
+  else
+    {
+      /* We have previously alloced storage.  Since we don't care
+         what's in the block we don't use realloc which would waste
+         cycles copying the old data.  We just free and alloc
+         again.  */
+      xfree (ms->a);
+      ms->a = NULL;
+    }
+  ms->a = (Lisp_Object *) xmalloc (need * word_size);
+  if (ms->a != NULL)
+    ms->alloced = need;
+}
+
+
+/* MERGE_LO() stably merges the NA elements starting at SSA with the
+   NB elements starting at SSB = SSA + NA, in-place.  NA and NB must
+   be positive.  We also require that SSA[NA-1] belongs at the end of
+   the merge, and should have NA <= NB.  */
+
+static void
+merge_lo (merge_state *ms, Lisp_Object *ssa, ptrdiff_t na, Lisp_Object *ssb,
+	  ptrdiff_t nb)
+{
+  Lisp_Object pred = ms->predicate;
+
+  eassume (ms && ssa && ssb && na > 0 && nb > 0);
+  eassume (ssa + na == ssb);
+  na <= ms->alloced ? 0 : merge_getmem (ms, na);
+
+  memcpy (ms->a, ssa, na * word_size);
+  Lisp_Object *dest = ssa;
+  ssa = ms->a;
+
+  ms->reloc = (struct reloc){&ssa, &dest, &na, -1};
+
+  *dest++ = *ssb++;
+  --nb;
+  if (nb == 0)
+    goto Succeed;
+  if (na == 1)
+    goto CopyB;
+
+  ptrdiff_t min_gallop = ms->min_gallop;
+  for (;;)
+    {
+      ptrdiff_t acount = 0;  /* This holds the # of consecutive times A won.  */
+
+      ptrdiff_t bcount = 0;  /* This holds the # of consecutive times B won.  */
+
+      for (;;)
+	{
+	  eassume (na > 1 && nb > 0);
+	  if (inorder (pred, *ssb, *ssa))
+	    {
+	      *dest++ = *ssb++ ;
+	      ++bcount;
+	      acount = 0;
+	      --nb;
+	      if (nb == 0)
+		goto Succeed;
+	      if (bcount >= min_gallop)
+		break;
+	    }
+	  else
+	    {
+	      *dest++ = *ssa++;
+	      ++acount;
+	      bcount = 0;
+	      --na;
+	      if (na == 1)
+		goto CopyB;
+	      if (acount >= min_gallop)
+		break;
+	    }
+	}
+
+      /* One run is winning so consistently that galloping may be a huge
+	 win.  We try that, and continue galloping until (if ever)
+	 neither run appears to be winning consistently anymore.  */
+      ++min_gallop;
+      do {
+	eassume (na > 1 && nb > 0);
+	min_gallop -= min_gallop > 1;
+	ms->min_gallop = min_gallop;
+	ptrdiff_t k = gallop_right (ms, ssb[0], ssa, na, 0);
+	acount = k;
+	if (k)
+	  {
+	    memcpy (dest, ssa, k * word_size);
+	    dest += k;
+	    ssa += k;
+	    na -= k;
+	    if (na == 1)
+	      goto CopyB;
+	    /* While na==0 is impossible now if the comparison function is
+	       consistent, we shouldn't assume that it is.  */
+	    if (na == 0)
+	      goto Succeed;
+	  }
+	*dest++ = *ssb++ ;
+	--nb;
+	if (nb == 0)
+	  goto Succeed;
+
+	k = gallop_left (ms, ssa[0], ssb, nb, 0);
+	bcount = k;
+	if (k)
+	  {
+	    memmove (dest, ssb, k * word_size);
+	    dest += k;
+	    ssb += k;
+	    nb -= k;
+	    if (nb == 0)
+	      goto Succeed;
+	  }
+	*dest++ = *ssa++;
+	--na;
+	if (na == 1)
+	  goto CopyB;
+      } while (acount >= GALLOP_WIN_MIN || bcount >= GALLOP_WIN_MIN);
+      ++min_gallop;   /* Apply a penalty for leaving galloping mode.  */
+      ms->min_gallop = min_gallop;
+    }
+ Succeed:
+  ms->reloc = (struct reloc){NULL, NULL, NULL, 0};
+
+  if (na)
+    memcpy (dest, ssa, na * word_size);
+  return;
+ CopyB:
+  eassume (na == 1 && nb > 0);
+  ms->reloc = (struct reloc){NULL, NULL, NULL, 0};
+
+  /* The last element of ssa belongs at the end of the merge.  */
+  memmove (dest, ssb, nb * word_size);
+  dest[nb] = ssa[0];
+}
+
+
+/* MERGE_HI() stably merges the NA elements starting at SSA with the
+   NB elements starting at SSB = SSA + NA, in-place.  NA and NB must
+   be positive.  We also require that SSA[NA-1] belongs at the end of
+   the merge, and should have NA >= NB.  */
+
+static void
+merge_hi (merge_state *ms, Lisp_Object *ssa, ptrdiff_t na,
+	  Lisp_Object *ssb, ptrdiff_t nb)
+{
+  Lisp_Object pred = ms->predicate;
+
+  eassume (ms && ssa && ssb && na > 0 && nb > 0);
+  eassume (ssa + na == ssb);
+  nb <= ms->alloced ? 0 : merge_getmem(ms, nb);
+  Lisp_Object *dest = ssb;
+  dest += nb - 1;
+  memcpy(ms->a, ssb, nb * word_size);
+  Lisp_Object *basea = ssa;
+  Lisp_Object *baseb = ms->a;
+  ssb = ms->a + nb - 1;
+  ssa += na - 1;
+
+  ms->reloc = (struct reloc){&baseb, &dest, &nb, 1};
+
+  *dest-- = *ssa--;
+  --na;
+  if (na == 0)
+    goto Succeed;
+  if (nb == 1)
+    goto CopyA;
+
+  ptrdiff_t min_gallop = ms->min_gallop;
+  for (;;) {
+    ptrdiff_t acount = 0;   /* This holds the # of consecutive times A won.  */
+    ptrdiff_t bcount = 0;   /* This holds the # of consecutive times B won.  */
+
+    for (;;) {
+      eassume (na > 0 && nb > 1);
+      if (inorder (pred, *ssb, *ssa))
+	{
+	  *dest-- = *ssa--;
+	  ++acount;
+	  bcount = 0;
+	  --na;
+	  if (na == 0)
+	    goto Succeed;
+	  if (acount >= min_gallop)
+	    break;
+	}
+      else
+	{
+	  *dest-- = *ssb--;
+	  ++bcount;
+	  acount = 0;
+	  --nb;
+	  if (nb == 1)
+	    goto CopyA;
+	  if (bcount >= min_gallop)
+	    break;
+	}
+    }
+
+    /* One run is winning so consistently that galloping may be a huge
+       win.  Try that, and continue galloping until (if ever) neither
+       run appears to be winning consistently anymore.  */
+    ++min_gallop;
+    do {
+      eassume (na > 0 && nb > 1);
+      min_gallop -= min_gallop > 1;
+      ms->min_gallop = min_gallop;
+      ptrdiff_t k = gallop_right (ms, ssb[0], basea, na, na - 1);
+      k = na - k;
+      acount = k;
+      if (k)
+	{
+	  dest += -k;
+	  ssa += -k;
+	  memmove(dest + 1, ssa + 1, k * word_size);
+	  na -= k;
+	  if (na == 0)
+	    goto Succeed;
+	}
+      *dest-- = *ssb--;
+      --nb;
+      if (nb == 1)
+	goto CopyA;
+
+      k = gallop_left (ms, ssa[0], baseb, nb, nb - 1);
+      k = nb - k;
+      bcount = k;
+      if (k)
+	{
+	  dest += -k;
+	  ssb += -k;
+	  memcpy(dest + 1, ssb + 1, k * word_size);
+	  nb -= k;
+	  if (nb == 1)
+	    goto CopyA;
+	  /* While nb==0 is impossible now if the comparison function
+	     is consistent, we shouldn't assume that it is.  */
+	  if (nb == 0)
+	    goto Succeed;
+	}
+      *dest-- = *ssa--;
+      --na;
+      if (na == 0)
+	goto Succeed;
+    } while (acount >= GALLOP_WIN_MIN || bcount >= GALLOP_WIN_MIN);
+    ++min_gallop;      /* Apply a penalty for leaving galloping mode.  */
+    ms->min_gallop = min_gallop;
+  }
+ Succeed:
+  ms->reloc = (struct reloc){NULL, NULL, NULL, 0};
+  if (nb)
+    memcpy (dest - nb + 1, baseb, nb * word_size);
+  return;
+ CopyA:
+  eassume (nb == 1 && na > 0);
+  ms->reloc = (struct reloc){NULL, NULL, NULL, 0};
+  /* The first element of ssb belongs at the front of the merge.  */
+  memmove (dest + 1 - na, ssa + 1 - na, na * word_size);
+  dest += -na;
+  ssa += -na;
+  dest[0] = ssb[0];
+}
+
+
+/* MERGE_AT() merges the two runs at stack indices I and I+1.  */
+
+static void
+merge_at (merge_state *ms, const ptrdiff_t i)
+{
+  eassume (ms != NULL);
+  eassume (ms->n >= 2);
+  eassume (i >= 0);
+  eassume (i == ms->n - 2 || i == ms->n - 3);
+
+  Lisp_Object *ssa = ms->pending[i].base;
+  ptrdiff_t na = ms->pending[i].len;
+  Lisp_Object *ssb = ms->pending[i + 1].base;
+  ptrdiff_t nb = ms->pending[i + 1].len;
+  eassume (na > 0 && nb > 0);
+  eassume (ssa + na == ssb);
+
+  /* Record the length of the combined runs; if i is the 3rd-last run
+     now, also slide over the last run (which isn't involved in this
+     merge).  The current run i+1 goes away in any case.  */
+  ms->pending[i].len = na + nb;
+  if (i == ms->n - 3)
+    ms->pending[i + 1] = ms->pending[i + 2];
+  --ms->n;
+
+  /* Where does b start in a?  Elements in a before that can be
+     ignored (they are already in place).  */
+  ptrdiff_t k = gallop_right (ms, *ssb, ssa, na, 0);
+  eassume (k >= 0);
+  ssa += k;
+  na -= k;
+  if (na == 0)
+    return;
+
+  /* Where does a end in b?  Elements in b after that can be ignored
+     (they are already in place).  */
+  nb = gallop_left (ms, ssa[na - 1], ssb, nb, nb - 1);
+  if (nb == 0)
+    return;
+  eassume (nb > 0);
+  /* Merge what remains of the runs using a temp array with size
+     min(na, nb) elements.  */
+  if (na <= nb)
+    merge_lo (ms, ssa, na, ssb, nb);
+  else
+    merge_hi (ms, ssa, na, ssb, nb);
+}
+
+
+/* POWERLOOP() computes the "power" of the first of two adjacent runs
+   begining at index S1, with the first having length N1 and the
+   second (starting at index S1+N1) having length N2.  The list has
+   total length N.  */
+
+static int
+powerloop (const ptrdiff_t s1, const ptrdiff_t n1, const ptrdiff_t n2,
+	   const ptrdiff_t n)
+{
+  eassume (s1 >= 0);
+  eassume (n1 > 0 && n2 > 0);
+  eassume (s1 + n1 + n2 <= n);
+  /* The midpoints a and b are
+     a = s1 + n1/2
+     b = s1 + n1 + n2/2 = a + (n1 + n2)/2
+
+     These may not be integers because of the "/2", so we work with
+     2*a and 2*b instead.  It makes no difference to the outcome,
+     since the bits in the expansion of (2*i)/n are merely shifted one
+     position from those of i/n.  */
+  ptrdiff_t a = 2 * s1 + n1;
+  ptrdiff_t b = a + n1 + n2;
+  int result = 0;
+  /* Emulate a/n and b/n one bit a time, until their bits differ.  */
+  for (;;)
+    {
+      ++result;
+      if (a >= n)
+	{  /* Both quotient bits are now 1.  */
+	  eassume (b >= a);
+	  a -= n;
+	  b -= n;
+	}
+      else if (b >= n)
+	{  /* a/n bit is 0 and b/n bit is 1.  */
+	  break;
+	} /* Otherwise both quotient bits are 0.  */
+      eassume (a < b && b < n);
+      a <<= 1;
+      b <<= 1;
+    }
+  return result;
+}
+
+
+/* FOUND_NEW_RUN() updates the state when a run of length N2 has been
+   identified.  If there's already a stretch on the stack, apply the
+   "powersort" merge strategy: compute the topmost stretch's "power"
+   (depth in a conceptual binary merge tree) and merge adjacent runs
+   on the stack with greater power.  */
+
+static void
+found_new_run (merge_state *ms, const ptrdiff_t n2)
+{
+  eassume (ms != NULL);
+  if (ms->n)
+    {
+      eassume (ms->n > 0);
+      struct stretch *p = ms->pending;
+      ptrdiff_t s1 = p[ms->n - 1].base - ms->listbase;
+      ptrdiff_t n1 = p[ms->n - 1].len;
+      int power = powerloop (s1, n1, n2, ms->listlen);
+      while (ms->n > 1 && p[ms->n - 2].power > power)
+	{
+	  merge_at (ms, ms->n - 2);
+	}
+      eassume (ms->n < 2 || p[ms->n - 2].power < power);
+      p[ms->n - 1].power = power;
+    }
+}
+
+
+/* MERGE_FORCE_COLLAPSE() unconditionally merges all stretches on the
+   stack until only one remains, and returns 0 on success.  This is
+   used at the end of the mergesort.  */
+
+static void
+merge_force_collapse (merge_state *ms)
+{
+  struct stretch *p = ms->pending;
+
+  eassume (ms != NULL);
+  while (ms->n > 1)
+    {
+      ptrdiff_t n = ms->n - 2;
+      if (n > 0 && p[n - 1].len < p[n + 1].len)
+	--n;
+      merge_at (ms, n);
+    }
+}
+
+
+/* MERGE_COMPUTE_MINRUN() computes a good value for the minimum run
+   length; natural runs shorter than this are boosted artificially via
+   binary insertion.
+
+   If N < 64, return N (it's too small to bother with fancy stuff).
+   Otherwise if N is an exact power of 2, return 32.  Finally, return
+   an int k, 32 <= k <= 64, such that N/k is close to, but strictly
+   less than, an exact power of 2.  */
+
+static ptrdiff_t
+merge_compute_minrun (ptrdiff_t n)
+{
+  ptrdiff_t r = 0;           /* r will become 1 if any non-zero bits are
+				shifted off.  */
+
+  eassume (n >= 0);
+  while (n >= 64)
+    {
+      r |= n & 1;
+      n >>= 1;
+    }
+  return n + r;
+}
+
+
+static void
+reverse_vector (Lisp_Object *s, const ptrdiff_t n)
+{
+  for (ptrdiff_t i = 0; i < n / 2; i++)
+    {
+      Lisp_Object tem = s[i];
+      s[i] =  s[n - i - 1];
+      s[n - i - 1] = tem;
+    }
+}
+
+/* TIM_SORT sorts the array SEQ with LENGTH elements in the order
+   determined by PREDICATE.  */
+
+void
+tim_sort (Lisp_Object predicate, Lisp_Object *seq, const ptrdiff_t length)
+{
+  merge_state ms;
+  Lisp_Object *lo = seq;
+
+  merge_init (&ms, length, lo, predicate);
+
+  if (length < 2)
+    return;
+
+  /* March over the array once, left to right, finding natural runs,
+     and extending short natural runs to minrun elements.  */
+  const ptrdiff_t minrun = merge_compute_minrun (length);
+  ptrdiff_t nremaining = length;
+  do {
+    bool descending;
+
+    /* Identify the next run.  */
+    ptrdiff_t n = count_run (&ms, lo, lo + nremaining, &descending);
+    if (descending)
+      reverse_vector (lo, n);
+    /* If the run is short, extend it to min(minrun, nremaining).  */
+    if (n < minrun)
+      {
+	const ptrdiff_t force = nremaining <= minrun ?
+	  nremaining : minrun;
+	binarysort (&ms, lo, lo + force, lo + n);
+	n = force;
+      }
+    eassume (ms.n == 0 || ms.pending[ms.n - 1].base +
+	     ms.pending[ms.n - 1].len == lo);
+    found_new_run (&ms, n);
+    /* Push the new run on to the stack.  */
+    eassume (ms.n < MAX_MERGE_PENDING);
+    ms.pending[ms.n].base = lo;
+    ms.pending[ms.n].len = n;
+    ++ms.n;
+    /* Advance to find the next run.  */
+    lo += n;
+    nremaining -= n;
+  } while (nremaining);
+
+  merge_force_collapse (&ms);
+  eassume (ms.n == 1);
+  eassume (ms.pending[0].len == length);
+  lo = ms.pending[0].base;
+
+  if (ms.a != ms.temparray && ms.alloced <= ms.listlen >> 1)
+    unbind_to (ms.count, Qnil);
+}
-- 
2.39.5