From be4cbc6641fb8418d33e2e6c1967bb20beb8a7f0 Mon Sep 17 00:00:00 2001 From: Jason Rhinelander Date: Fri, 8 May 2020 13:57:29 -0300 Subject: [PATCH] Add base64 encoder/decoder --- lokimq/base64.h | 217 ++++++++++++++++++++++++++++++++++++++++ tests/test_encoding.cpp | 78 ++++++++++++++- 2 files changed, 294 insertions(+), 1 deletion(-) create mode 100644 lokimq/base64.h diff --git a/lokimq/base64.h b/lokimq/base64.h new file mode 100644 index 0000000..31aa805 --- /dev/null +++ b/lokimq/base64.h @@ -0,0 +1,217 @@ +// Copyright (c) 2019-2020, The Loki Project +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without modification, are +// permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this list of +// conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, this list +// of conditions and the following disclaimer in the documentation and/or other +// materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors may be +// used to endorse or promote products derived from this software without specific +// prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL +// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +// THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#pragma once +#include "string_view.h" +#include +#include +#include + +namespace lokimq { + +namespace detail { + +/// Compile-time generated lookup tables for base64 conversion. +struct b64_table { + // Store the 0-63 decoded value of every possible char; all the chars that aren't valid are set + // to 0. (If you don't trust your data, check it with is_base64 first, which uses these 0's + // to detect invalid characters -- which is why we want a full 256 element array). + char from_b64_lut[256]; + // Store the encoded character of every 0-63 (6 bit) value. + char to_b64_lut[64]; + + // constexpr constructor that fills out the above (and should do it at compile time for any half + // decent compiler). 
+ constexpr b64_table() noexcept : from_b64_lut{}, to_b64_lut{} { + for (unsigned char c = 0; c < 26; c++) { + from_b64_lut[(unsigned char)('A' + c)] = 0 + c; + to_b64_lut[ (unsigned char)( 0 + c)] = 'A' + c; + } + for (unsigned char c = 0; c < 26; c++) { + from_b64_lut[(unsigned char)('a' + c)] = 26 + c; + to_b64_lut[ (unsigned char)(26 + c)] = 'a' + c; + } + for (unsigned char c = 0; c < 10; c++) { + from_b64_lut[(unsigned char)('0' + c)] = 52 + c; + to_b64_lut[ (unsigned char)(52 + c)] = '0' + c; + } + to_b64_lut[62] = '+'; from_b64_lut[(unsigned char) '+'] = 62; + to_b64_lut[63] = '/'; from_b64_lut[(unsigned char) '/'] = 63; + } + // Convert a b64 encoded character into a 0-63 value (characters outside the alphabet yield 0) + constexpr char from_b64(unsigned char c) const noexcept { return from_b64_lut[c]; } + // Convert a 0-63 value into a b64 encoded character + constexpr char to_b64(unsigned char b) const noexcept { return to_b64_lut[b]; } +} constexpr b64_lut; + +// The main point of this static assert is to force the compiler to compile-time build the constexpr tables. +static_assert(b64_lut.from_b64('/') == 63 && b64_lut.from_b64('7') == 59 && b64_lut.to_b64(38) == 'm', ""); + +} // namespace detail + +/// Converts bytes into a base64 encoded character sequence written to `out`; the output is padded with trailing '=' to a multiple of 4 characters. +template +void to_base64(InputIt begin, InputIt end, OutputIt out) { + static_assert(sizeof(*begin) == 1, "to_base64 requires chars/bytes"); + int bits = 0; // Tracks the number of unconsumed bits held in r, will always be in {0, 2, 4} + std::uint_fast16_t r = 0; + while (begin != end) { + r = r << 8 | static_cast(*begin++); + + // we just added 8 bits, so we can *always* consume 6 to produce one character, so (net) we + // are adding 2 bits. 
+ bits += 2; + *out++ = detail::b64_lut.to_b64(r >> bits); // Right-shift off the bits we aren't consuming right now + + // Drop the bits we don't want to keep (because we just consumed them) + r &= (1 << bits) - 1; + + if (bits == 6) { // We have enough bits to produce a second character (which means we had 4 before and added 8) + bits = 0; + *out++ = detail::b64_lut.to_b64(r); + r = 0; + } + } + + // If bits == 0 then we ended our 6-bit outputs coinciding with 8-bit values, i.e. at a multiple + // of 24 bits: this means we don't have anything else to output and don't need any padding. + if (bits == 2) { + // We finished with 2 unconsumed bits, which means we ended 1 byte past a 24-bit group (e.g. + // 1 byte, 4 bytes, 301 bytes, etc.); since we need to always be a multiple of 4 output + // characters that means we've produced 1: so we right-fill 0s to get the next char, then + // add two padding ='s. + *out++ = detail::b64_lut.to_b64(r << 4); + *out++ = '='; + *out++ = '='; + } else if (bits == 4) { + // 4 bits left means we produced 2 6-bit values from the first 2 bytes of a 3-byte group. + // Fill 0s to get the last one, plus one padding output. + *out++ = detail::b64_lut.to_b64(r << 2); + *out++ = '='; + } +} + +/// Creates a base64 string from an iterable, std::string-like object +inline std::string to_base64(string_view s) { + std::string base64; + base64.reserve((s.size() + 2) / 3 * 4); + to_base64(s.begin(), s.end(), std::back_inserter(base64)); + return base64; +} + +inline std::string to_base64(ustring_view s) { + std::string base64; + base64.reserve((s.size() + 2) / 3 * 4); + to_base64(s.begin(), s.end(), std::back_inserter(base64)); + return base64; +} + +/// Returns true if the range is a base64 encoded value; we allow (but do not require) '=' padding, +/// but only at the end, only 1 or 2, and only if it pads out the total to a multiple of 4. 
+template +constexpr bool is_base64(It begin, It end) { + static_assert(sizeof(*begin) == 1, "is_base64 requires chars/bytes"); + using std::distance; + using std::prev; + + // Allow 1 or 2 padding chars *if* they pad it to a multiple of 4. + if (begin != end && distance(begin, end) % 4 == 0) { + auto last = prev(end); + if (*last == '=') + end = last--; + if (*last == '=') + end = last; + } + + for (; begin != end; ++begin) { + auto c = *begin; + if (detail::b64_lut.from_b64(c) == 0 && c != 'A') + return false; + } + return true; +} + +/// Returns true if the string-like value is a base64 encoded value +constexpr bool is_base64(string_view s) { return is_base64(s.begin(), s.end()); } +constexpr bool is_base64(ustring_view s) { return is_base64(s.begin(), s.end()); } + +/// Converts a sequence of base64 digits to bytes. Undefined behaviour if any characters are not +/// valid base64 alphabet characters. It is permitted for the input and output ranges to overlap as +/// long as `out` is no later than `begin`. Trailing padding characters are permitted but not +/// required. +/// +/// It is possible to provide "impossible" base64 encoded values; for example "YWJja" which has 30 +/// bits of data even though a base64 encoded byte string should have 24 (4 chars) or 36 (6 chars) +/// bits for a 3- and 4-byte input, respectively. We ignore any such "impossible" bits, and +/// similarly ignore impossible bits in the bit "overhang"; that means "YWJjZA==" (the proper +/// encoding of "abcd") and "YWJjZB", "YWJjZC", ..., "YWJjZP" all decode to the same "abcd" value: +/// the last 4 bits of the last character are essentially considered padding. +template +void from_base64(InputIt begin, InputIt end, OutputIt out) { + using Char = decltype(*begin); + static_assert(sizeof(Char) == 1, "from_base64 requires chars/bytes"); + std::uint_fast16_t curr = 0; // must start at 0: the first loop iteration shifts and ORs into it + int bits = 0; // number of bits we've loaded into curr; we always keep this < 8. 
+ while (begin != end) { + Char c = *begin++; + + // padding; don't bother checking if we're at the end because is_base64 is a precondition + // and we're allowed UB if it isn't satisfied. + if (c == '=') continue; + + curr = curr << 6 | detail::b64_lut.from_b64(c); + if (bits == 0) + bits = 6; + else { + bits -= 2; // Added 6, removing 8 + *out++ = static_cast(curr >> bits); + curr &= (1 << bits) - 1; + } + } + // Don't worry about leftover bits because either they have to be 0, or they can't happen at + // all. See base32z.h for why: the reasoning is exactly the same (except using 6 bits per + // character here instead of 5). +} + +/// Converts base64 digits from a std::string-like object into a std::string of bytes. Undefined +/// behaviour if any characters are not valid base64 characters. +inline std::string from_base64(string_view s) { + std::string bytes; + bytes.reserve(s.size()*6 / 8); + from_base64(s.begin(), s.end(), std::back_inserter(bytes)); + return bytes; +} + +inline std::string from_base64(ustring_view s) { + std::string bytes; + bytes.reserve(s.size()*6 / 8); + from_base64(s.begin(), s.end(), std::back_inserter(bytes)); + return bytes; +} + +} diff --git a/tests/test_encoding.cpp b/tests/test_encoding.cpp index f11fb3e..49bc809 100644 --- a/tests/test_encoding.cpp +++ b/tests/test_encoding.cpp @@ -1,13 +1,20 @@ #include "lokimq/hex.h" -#include #include "lokimq/base32z.h" +#include "lokimq/base64.h" #include "common.h" using namespace std::literals; TEST_CASE("hex encoding/decoding", "[encoding][decoding][hex]") { REQUIRE( lokimq::to_hex("\xff\x42\x12\x34") == "ff421234"s ); + std::vector chars{{1, 10, 100, 254}}; + std::array out; + std::array expected{{'0', '1', '0', 'a', '6', '4', 'f', 'e'}}; + lokimq::to_hex(chars.begin(), chars.end(), out.begin()); + REQUIRE( out == expected ); + REQUIRE( lokimq::from_hex("12345678ffEDbca9") == "\x12\x34\x56\x78\xff\xed\xbc\xa9"s ); + REQUIRE( lokimq::is_hex("1234567890abcdefABCDEF1234567890abcdefABCDEF") ); 
REQUIRE_FALSE( lokimq::is_hex("1234567890abcdefABCDEF1234567890aGcdefABCDEF") ); REQUIRE_FALSE( lokimq::is_hex("1234567890abcdefABCDEF1234567890agcdefABCDEF") ); @@ -50,3 +57,72 @@ TEST_CASE("base32z encoding/decoding", "[encoding][decoding][base32z]") { // This one won't round-trip to the same value since it has ignored garbage bytes at the end REQUIRE( lokimq::to_base32z(lokimq::from_base32z("ybndrf4"s)) == "ybndrfa" ); } + +TEST_CASE("base64 encoding/decoding", "[encoding][decoding][base64]") { + // 00000000 00000000 00000000 -> 000000 000000 000000 000000 + REQUIRE( lokimq::to_base64("\0\0\0"s) == "AAAA" ); + // 00000001 00000010 00000011 -> 000000 010000 001000 000011 + REQUIRE( lokimq::to_base64("\x01\x02\x03"s) == "AQID" ); + REQUIRE( lokimq::to_base64("\0\0\0\0"s) == "AAAAAA==" ); + // Bit layout of the "\0\0\0\xff" case checked further down: 00000000 00000000 00000000 11111111 -> + // 000000 000000 000000 000000 111111 110000 (pad) (pad) + REQUIRE( lokimq::to_base64("a") == "YQ==" ); + REQUIRE( lokimq::to_base64("ab") == "YWI=" ); + REQUIRE( lokimq::to_base64("abc") == "YWJj" ); + REQUIRE( lokimq::to_base64("abcd") == "YWJjZA==" ); + REQUIRE( lokimq::to_base64("abcde") == "YWJjZGU=" ); + REQUIRE( lokimq::to_base64("abcdef") == "YWJjZGVm" ); + + REQUIRE( lokimq::to_base64("\0\0\0\xff"s) == "AAAA/w==" ); + REQUIRE( lokimq::to_base64("\0\0\0\xff\xff"s) == "AAAA//8=" ); + REQUIRE( lokimq::to_base64("\0\0\0\xff\xff\xff"s) == "AAAA////" ); + REQUIRE( lokimq::to_base64( + "Man is distinguished, not only by his reason, but by this singular passion from other " + "animals, which is a lust of the mind, that by a perseverance of delight in the " + "continued and indefatigable generation of knowledge, exceeds the short vehemence of " + "any carnal pleasure.") + == + "TWFuIGlzIGRpc3Rpbmd1aXNoZWQsIG5vdCBvbmx5IGJ5IGhpcyByZWFzb24sIGJ1dCBieSB0aGlz" + "IHNpbmd1bGFyIHBhc3Npb24gZnJvbSBvdGhlciBhbmltYWxzLCB3aGljaCBpcyBhIGx1c3Qgb2Yg" + "dGhlIG1pbmQsIHRoYXQgYnkgYSBwZXJzZXZlcmFuY2Ugb2YgZGVsaWdodCBpbiB0aGUgY29udGlu" + 
"dWVkIGFuZCBpbmRlZmF0aWdhYmxlIGdlbmVyYXRpb24gb2Yga25vd2xlZGdlLCBleGNlZWRzIHRo" + "ZSBzaG9ydCB2ZWhlbWVuY2Ugb2YgYW55IGNhcm5hbCBwbGVhc3VyZS4=" ); + + REQUIRE( lokimq::from_base64("A+/A") == "\x03\xef\xc0" ); + REQUIRE( lokimq::from_base64("YWJj") == "abc" ); + REQUIRE( lokimq::from_base64("YWJjZA==") == "abcd" ); + REQUIRE( lokimq::from_base64("YWJjZA") == "abcd" ); + REQUIRE( lokimq::from_base64("YWJjZB") == "abcd" ); // ignore superfluous bits + REQUIRE( lokimq::from_base64("YWJjZP") == "abcd" ); // ignore superfluous bits (all four overhang bits set) + REQUIRE( lokimq::from_base64("YWJj+") == "abc" ); // ignore superfluous bits + REQUIRE( lokimq::from_base64("YWJjZGU=") == "abcde" ); + REQUIRE( lokimq::from_base64("YWJjZGU") == "abcde" ); + REQUIRE( lokimq::from_base64("YWJjZGVm") == "abcdef" ); + + REQUIRE( lokimq::is_base64("YWJjZGVm") ); + REQUIRE( lokimq::is_base64("YWJjZGU") ); + REQUIRE( lokimq::is_base64("YWJjZGU=") ); + REQUIRE( lokimq::is_base64("YWJjZA==") ); + REQUIRE( lokimq::is_base64("YWJjZA") ); + REQUIRE( lokimq::is_base64("YWJjZB") ); // not really valid, but we explicitly accept it + + REQUIRE_FALSE( lokimq::is_base64("YWJjZ=") ); // invalid padding (padding can only be 4th or 3rd+4th of a 4-char block) + REQUIRE_FALSE( lokimq::is_base64("YWJj=") ); + REQUIRE_FALSE( lokimq::is_base64("YWJj=A") ); + REQUIRE_FALSE( lokimq::is_base64("YWJjA===") ); + REQUIRE_FALSE( lokimq::is_base64("YWJ[") ); + REQUIRE_FALSE( lokimq::is_base64("YWJ.") ); + REQUIRE_FALSE( lokimq::is_base64("_YWJ") ); + + REQUIRE( lokimq::from_base64( + "TWFuIGlzIGRpc3Rpbmd1aXNoZWQsIG5vdCBvbmx5IGJ5IGhpcyByZWFzb24sIGJ1dCBieSB0aGlz" + "IHNpbmd1bGFyIHBhc3Npb24gZnJvbSBvdGhlciBhbmltYWxzLCB3aGljaCBpcyBhIGx1c3Qgb2Yg" + "dGhlIG1pbmQsIHRoYXQgYnkgYSBwZXJzZXZlcmFuY2Ugb2YgZGVsaWdodCBpbiB0aGUgY29udGlu" + "dWVkIGFuZCBpbmRlZmF0aWdhYmxlIGdlbmVyYXRpb24gb2Yga25vd2xlZGdlLCBleGNlZWRzIHRo" + "ZSBzaG9ydCB2ZWhlbWVuY2Ugb2YgYW55IGNhcm5hbCBwbGVhc3VyZS4=" ) + == + "Man is distinguished, not only by his reason, but by this singular passion 
from other " + "animals, which is a lust of the mind, that by a perseverance of delight in the " + "continued and indefatigable generation of knowledge, exceeds the short vehemence of " + "any carnal pleasure."); +}