From 358fd42aabec56e471ed3c8e6f3dccbc305ff6f7 Mon Sep 17 00:00:00 2001 From: Jonathan Wakely <jwakely@redhat.com> Date: Tue, 23 Jan 2024 14:57:15 +0000 Subject: [PATCH] libstdc++: Add "ASCII" as an alias for std::text_encoding::id::ASCII As noted in LWG 4043, "ASCII" is not an alias for any known registered character encoding, so std::text_encoding("ASCII").mib() == id::other. Add the alias "ASCII" to the implementation-defined superset of aliases for that encoding. libstdc++-v3/ChangeLog: * include/bits/text_encoding-data.h: Regenerate. * scripts/gen_text_encoding_data.py: Add extra_aliases dict containing "ASCII". * testsuite/std/text_encoding/cons.cc: Check "ascii" is known. Co-authored-by: Ewan Higgs <ewan.higgs@gmail.com> Signed-off-by: Ewan Higgs <ewan.higgs@gmail.com> --- .../include/bits/text_encoding-data.h | 3 ++- .../scripts/gen_text_encoding_data.py | 24 ++++++++++++++++++- .../testsuite/std/text_encoding/cons.cc | 5 ++++ 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/libstdc++-v3/include/bits/text_encoding-data.h b/libstdc++-v3/include/bits/text_encoding-data.h index 7ac2e9dc3d92..5041e738d214 100644 --- a/libstdc++-v3/include/bits/text_encoding-data.h +++ b/libstdc++-v3/include/bits/text_encoding-data.h @@ -14,6 +14,7 @@ { 3, "IBM367" }, { 3, "cp367" }, { 3, "csASCII" }, + { 3, "ASCII" }, // libstdc++ extension { 4, "ISO_8859-1:1987" }, { 4, "iso-ir-100" }, { 4, "ISO_8859-1" }, @@ -417,7 +418,7 @@ { 104, "csISO2022CN" }, { 105, "ISO-2022-CN-EXT" }, { 105, "csISO2022CNEXT" }, -#define _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET 413 +#define _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET 414 { 106, "UTF-8" }, { 106, "csUTF8" }, { 109, "ISO-8859-13" }, diff --git a/libstdc++-v3/scripts/gen_text_encoding_data.py b/libstdc++-v3/scripts/gen_text_encoding_data.py index 2d6f3e4077a4..f0ebb42d8c20 100755 --- a/libstdc++-v3/scripts/gen_text_encoding_data.py +++ b/libstdc++-v3/scripts/gen_text_encoding_data.py @@ -36,6 +36,18 @@ print("#ifndef _GLIBCXX_GET_ENCODING_DATA") print('# error "This is not a public header, do not include it directly"') print("#endif\n") +# We need to generate a list of initializers of the form { mib, alias }, e.g., +# { 3, "US-ASCII" }, +# { 3, "ISO646-US" }, +# { 3, "csASCII" }, +# { 4, "ISO_8859-1:1987" }, +# { 4, "latin1" }, +# The initializers must be sorted by the mib value. The first entry for +# a given mib must be the primary name for the encoding. Any aliases for +# the encoding come after the primary name. +# We also define a macro _GLIBCXX_TEXT_ENCODING_UTF8_OFFSET which is the +# offset into the list of the mib=106, alias="UTF-8" entry. This is used +# to optimize the common case, so we don't need to search for "UTF-8". charsets = {} with open(sys.argv[1], newline='') as f: @@ -52,10 +64,15 @@ with open(sys.argv[1], newline='') as f: aliases.remove(name) charsets[mib] = [name] + aliases -# Remove "NATS-DANO" and "NATS-DANO-ADD" +# Remove "NATS-DANO" and "NATS-DANO-ADD" as specified by the C++ standard. charsets.pop(33, None) charsets.pop(34, None) +# This is not an official IANA alias, but we include it in the +# implementation-defined superset of aliases for US-ASCII. +# See also LWG 4043. +extra_aliases = {3: ["ASCII"]} + count = 0 for mib in sorted(charsets.keys()): names = charsets[mib] @@ -64,6 +81,11 @@ for mib in sorted(charsets.keys()): for name in names: print(' {{ {:4}, "{}" }},'.format(mib, name)) count += len(names) + if mib in extra_aliases: + names = extra_aliases[mib] + for name in names: + print(' {{ {:4}, "{}" }}, // libstdc++ extension'.format(mib, name)) + count += len(names) # <text_encoding> gives an error if this macro is left defined. # Do this last, so that the generated output is not usable unless we reach here. diff --git a/libstdc++-v3/testsuite/std/text_encoding/cons.cc b/libstdc++-v3/testsuite/std/text_encoding/cons.cc index b9d93641de44..8fcc2ec8c3b9 100644 --- a/libstdc++-v3/testsuite/std/text_encoding/cons.cc +++ b/libstdc++-v3/testsuite/std/text_encoding/cons.cc @@ -53,6 +53,11 @@ test_construct_by_name() VERIFY( e4.name() == s ); VERIFY( ! e4.aliases().empty() ); VERIFY( e4.aliases().front() == "US-ASCII"sv ); // primary name + + s = "ascii"; + std::text_encoding e5(s); + VERIFY( e5.mib() == std::text_encoding::ASCII ); + VERIFY( e5.name() == s ); } constexpr void -- GitLab