From fa6549c1f0e75ff33cb641d98af72ee354b04bbe Mon Sep 17 00:00:00 2001
From: Jonathan Wakely <jwakely@redhat.com>
Date: Wed, 18 Dec 2024 12:57:14 +0000
Subject: [PATCH] libstdc++: Handle errors from strxfrm in
 std::collate::transform [PR85824]

std::regex builds a cache of equivalence classes by calling
std::regex_traits<char>::transform_primary(c) for every char, which then
calls std::collate<char>::transform which calls strxfrm. On several
targets strxfrm fails for non-ASCII characters. Because strxfrm has no
return value reserved to indicate an error, some implementations return
INT_MAX or SIZE_MAX. This causes std::collate::transform to try to
allocate a huge buffer, which is either very slow or throws
std::bad_alloc. We should check errno after calling strxfrm to detect
errors and then throw a more appropriate exception instead of trying to
allocate a huge buffer.

Unfortunately the std::collate<C>::_M_transform function has a
non-throwing exception specifier, so we can't do the error handling
there.

As well as checking errno, this patch changes std::collate::do_transform
to use __builtin_alloca for small inputs, and to use RAII to deallocate
the buffers used for large inputs.

This change isn't sufficient to fix the three std::regex bugs caused by
the lack of error handling in std::collate::do_transform; we also need
to make std::regex_traits::transform_primary handle exceptions. This
change also attempts to make transform_primary closer to the effects
described in the standard, by not even attempting to use std::collate if
the locale's std::collate facet has been replaced (see PR 118105).
Implementing the correct effects for transform_primary requires RTTI, so
that we don't use some user-defined std::collate facet with unknown
semantics. When -fno-rtti is used, transform_primary just returns an
empty string, making equivalence classes unusable in std::basic_regex.
That's not ideal, but I don't have any better ideas.

I'm unsure if std::regex_traits<C>::transform_primary is supposed to
convert the string to lower case or not.  The general regex traits
requirements ([re.req] p20) do say "when character case is not
considered" but the specification for the std::regex_traits<char> and
std::regex_traits<wchar_t> specializations ([re.traits] p7) doesn't say
anything about that.

With the r15-6317-geb339c29ee42aa change, transform_primary is not
called unless the regex actually uses an equivalence class. But using an
equivalence class would still fail (or be incredibly slow) on some
targets. With this commit, equivalence classes should be usable on all
targets, without excessive memory allocations.

Arguably, we should not even try to call transform_primary for any char
values over 127, since they're never valid in locales that use UTF-8 or
7-bit ASCII, and probably for other charsets too. Handling 128
exceptions for every std::regex compilation is very inefficient, but at
least it now works instead of failing with std::bad_alloc, and no longer
allocates 128 x 2GB. Maybe for C++26 we could check the locale's
std::text_encoding and use that to decide whether to cache equivalence
classes for char values over 127.

libstdc++-v3/ChangeLog:

	PR libstdc++/85824
	PR libstdc++/94409
	PR libstdc++/98723
	PR libstdc++/118105
	* include/bits/locale_classes.tcc (collate::do_transform): Check
	errno after calling _M_transform. Use RAII type to manage the
	buffer and to restore errno.
	* include/bits/regex.h (regex_traits::transform_primary): Handle
	exceptions from std::collate::transform and do not try to use
	std::collate for user-defined facets.
---
 libstdc++-v3/include/bits/locale_classes.tcc | 94 ++++++++++++++------
 libstdc++-v3/include/bits/regex.h            | 46 +++++++---
 2 files changed, 99 insertions(+), 41 deletions(-)

diff --git a/libstdc++-v3/include/bits/locale_classes.tcc b/libstdc++-v3/include/bits/locale_classes.tcc
index 52bd6d0f3524..ff1e943cdd6f 100644
--- a/libstdc++-v3/include/bits/locale_classes.tcc
+++ b/libstdc++-v3/include/bits/locale_classes.tcc
@@ -37,6 +37,9 @@
 #ifdef _GLIBCXX_SYSHDR
 #pragma GCC system_header
 #endif
+
+#include <cerrno>
+
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wc++11-extensions" // extern template
 #pragma GCC diagnostic ignored "-Wvariadic-macros"
@@ -295,43 +298,76 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
       size_t __len = (__hi - __lo) * 2;
 
-      _CharT* __c = new _CharT[__len];
+      struct _Buf
+      {
+	_Buf(size_t __n, void* __buf, int __e)
+	: _M_c(__buf ? (_CharT*)__buf : new _CharT[__n]),
+	  _M_stackbuf(__buf),
+	  _M_errno(__e)
+	{ }
 
-      __try
+	~_Buf()
+	{
+	  if (_M_c != _M_stackbuf)
+	    delete[] _M_c;
+	  if (errno == 0)
+	    errno = _M_errno;
+	}
+
+	void _M_realloc(size_t __len)
+	{
+	  _CharT* __p = new _CharT[__len];
+	  if (_M_c != _M_stackbuf)
+	    delete[] _M_c;
+	  _M_c = __p;
+	}
+
+	_CharT* _M_c;
+	void* const _M_stackbuf;
+	int _M_errno;
+      };
+
+      const size_t __bytes = __len * sizeof(_CharT);
+      _Buf __buf(__len, __bytes <= 256 ? __builtin_alloca(__bytes) : 0, errno);
+      errno = 0;
+
+      // strxfrm stops when it sees a nul character so we break
+      // the string into zero-terminated substrings and pass those
+      // to strxfrm.
+      for (;;)
 	{
-	  // strxfrm stops when it sees a nul character so we break
-	  // the string into zero-terminated substrings and pass those
-	  // to strxfrm.
-	  for (;;)
+	  // First try a buffer perhaps big enough.
+	  size_t __res = _M_transform(__buf._M_c, __p, __len);
+	  // If the buffer was not large enough, try again with the
+	  // correct size.
+	  if (__res >= __len)
 	    {
-	      // First try a buffer perhaps big enough.
-	      size_t __res = _M_transform(__c, __p, __len);
-	      // If the buffer was not large enough, try again with the
-	      // correct size.
-	      if (__res >= __len)
+	      if (__builtin_expect(errno, 0))
 		{
-		  __len = __res + 1;
-		  delete [] __c, __c = 0;
-		  __c = new _CharT[__len];
-		  __res = _M_transform(__c, __p, __len);
+#if __cpp_exceptions
+		  __throw_system_error(errno);
+#else
+		  // std::regex can call this function internally with
+		  // char values that always fail, so we don't want to
+		  // use _GLIBCXX_THROW_OR_ABORT here.
+		  __ret.clear();
+		  break;
+#endif
 		}
 
-	      __ret.append(__c, __res);
-	      __p += char_traits<_CharT>::length(__p);
-	      if (__p == __pend)
-		break;
-
-	      __p++;
-	      __ret.push_back(_CharT());
+	      __len = __res + 1;
+	      __buf._M_realloc(__len);
+	      __res = _M_transform(__buf._M_c, __p, __len);
 	    }
-	}
-      __catch(...)
-	{
-	  delete [] __c;
-	  __throw_exception_again;
-	}
 
-      delete [] __c;
+	  __ret.append(__buf._M_c, __res);
+	  __p += char_traits<_CharT>::length(__p);
+	  if (__p == __pend)
+	    break;
+
+	  __p++;
+	  __ret.push_back(_CharT());
+	}
 
       return __ret;
     }
diff --git a/libstdc++-v3/include/bits/regex.h b/libstdc++-v3/include/bits/regex.h
index 356e3f4e92a0..50d8863537d4 100644
--- a/libstdc++-v3/include/bits/regex.h
+++ b/libstdc++-v3/include/bits/regex.h
@@ -31,6 +31,9 @@
 #if __cplusplus >= 202002L
 # include <bits/iterator_concepts.h>	// std::default_sentinel_t
 #endif
+#if __cpp_rtti
+# include <typeinfo>
+#endif
 
 namespace std _GLIBCXX_VISIBILITY(default)
 {
@@ -253,9 +256,9 @@ _GLIBCXX_BEGIN_NAMESPACE_CXX11
        * @param __first beginning of the character sequence.
        * @param __last  one-past-the-end of the character sequence.
        *
-       * Effects: if typeid(use_facet<collate<_Ch_type> >) ==
-       * typeid(collate_byname<_Ch_type>) and the form of the sort key
-       * returned by collate_byname<_Ch_type>::transform(__first, __last)
+       * Effects: if `typeid(use_facet<collate<_Ch_type>>(getloc())) ==
+       * typeid(collate_byname<_Ch_type>)` and the form of the sort key
+       * returned by `collate_byname<_Ch_type>::transform(__first, __last)`
        * is known and can be converted into a primary sort key
        * then returns that key, otherwise returns an empty string.
        *
@@ -265,17 +268,36 @@ _GLIBCXX_BEGIN_NAMESPACE_CXX11
 	string_type
 	transform_primary(_Fwd_iter __first, _Fwd_iter __last) const
 	{
+	  string_type __ret;
+#if __cpp_rtti
+	  const auto& __fclt = use_facet<collate<char_type>>(_M_locale);
+	  if (typeid(__fclt) != typeid(collate<char_type>)) // FIXME: PR 118110
+	    return __ret;
+
 	  // TODO : this is not entirely correct.
 	  // This function requires extra support from the platform.
-	  //
-	  // Read http://gcc.gnu.org/ml/libstdc++/2013-09/msg00117.html and
-	  // http://www.open-std.org/Jtc1/sc22/wg21/docs/papers/2003/n1429.htm
-	  // for details.
-	  typedef std::ctype<char_type> __ctype_type;
-	  const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
-	  _GLIBCXX_STD_C::vector<char_type> __s(__first, __last);
-	  __fctyp.tolower(__s.data(), __s.data() + __s.size());
-	  return this->transform(__s.data(), __s.data() + __s.size());
+	  // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=118105
+
+	  const auto& __fctyp(use_facet<ctype<char_type>>(_M_locale));
+	  basic_string<char_type> __s(__first, __last);
+	  const auto __p = const_cast<char_type*>(__s.c_str());
+	  const auto __pend = __p + __s.size();
+	  // XXX: should we use tolower here? The regex traits requirements
+	  // say that transform_primary ignores case, but the specification
+	  // for the std::regex_traits<char> and std::regex_traits<wchar_t>
+	  // specializations don't, they seem to suggest just using the
+	  // collate::transform function to get a primary sort key.
+	  __fctyp.tolower(__p, __pend);
+
+	  __try
+	    {
+	      __ret = __fclt.transform(__p, __pend);
+	    }
+	  __catch (const exception&)
+	    {
+	    }
+#endif
+	  return __ret;
 	}
 
       /**
-- 
GitLab