* conversion will try to fullfill Target when whole Source is read

This commit is contained in:
Vovanium 2023-04-25 22:55:24 +03:00
parent 5707762d70
commit d37628b673
8 changed files with 33 additions and 9 deletions

View file

@ -6,6 +6,7 @@ abstract project Common is
"-gnato",
"-O2",
"-ggdb",
"-gnata",
"-gnatW8",
"-gnatyAabefikltx"
);

View file

@ -18,11 +18,12 @@ package body Encodings.Unicode.UTF_16.From_UTF_32 is
begin
Source_Last := Source'First - 1;
Target_Last := Target'First - 1;
while Source_Last < Source'Last and Target_Last < Target'Last loop
Main : while Target_Last < Target'Last loop
if State.Surrogate in Low_Surrogate then
Q := State.Surrogate;
State.Surrogate := UTF_16_Character'Val (0);
else
exit Main when Source_Last >= Source'Last;
Source_Last := Source_Last + 1;
Code := UTF_32_Character'Pos (Source (Source_Last));
if Code <= 16#FFFF# then -- BMP
@ -34,7 +35,7 @@ package body Encodings.Unicode.UTF_16.From_UTF_32 is
end if;
Target_Last := Target_Last + 1;
Target (Target_Last) := Q;
end loop;
end loop Main;
end Process;
end Encodings.Unicode.UTF_16.From_UTF_32;

View file

@ -15,9 +15,15 @@ private
use Interfaces;
function Make_High_Surrogate (C : Code_Point) return UTF_16_Character
is (UTF_16_Character'Val (Shift_Right (C - 16#010000#, 10) or 16#D800#));
is (UTF_16_Character'Val (Shift_Right (C - 16#010000#, 10) or 16#D800#))
with
Pre => C not in Basic_Multilingual_Plane,
Post => Make_High_Surrogate'Result in High_Surrogate;
function Make_Low_Surrogate (C : Code_Point) return UTF_16_Character
is (UTF_16_Character'Val ((C and 16#03FF#) or 16#DC00#));
is (UTF_16_Character'Val ((C and 16#03FF#) or 16#DC00#))
with
Pre => C not in Basic_Multilingual_Plane,
Post => Make_Low_Surrogate'Result in Low_Surrogate;
end Encodings.Unicode.UTF_16;

View file

@ -1,5 +1,7 @@
package Encodings.Unicode.UTF_32 is
subtype UTF_32_Character is Wide_Wide_Character
range Wide_Wide_Character'Val (0) .. Wide_Wide_Character'Val (Code_Point_Last);
subtype UTF_32_String is Wide_Wide_String;
end Encodings.Unicode.UTF_32;

View file

@ -19,11 +19,12 @@ package body Encodings.Unicode.UTF_8.From_UTF_32 is
begin
Source_Last := Source'First - 1;
Target_Last := Target'First - 1;
while Source_Last < Source'Last and Target_Last < Target'Last loop
Main : while Target_Last < Target'Last loop
if State.L > 0 then
State.L := State.L - 1;
Q := Make_Extension_Byte (State.Code, State.L);
else
exit Main when Source_Last >= Source'Last;
Source_Last := Source_Last + 1;
C := Source (Source_Last);
case UTF_32_Character'Pos (C) is
@ -47,7 +48,7 @@ package body Encodings.Unicode.UTF_8.From_UTF_32 is
end if;
Target_Last := Target_Last + 1;
Target (Target_Last) := Q;
end loop;
end loop Main;
end Process;
end Encodings.Unicode.UTF_8.From_UTF_32;

View file

@ -33,7 +33,10 @@ private
P : Code_Point;
R : Natural) -- Remaining bytes (index from the last)
return Extension_Byte
is (UTF_8_Character'Val ((Interfaces.Shift_Right (P, 6 * R) and Extension_Byte_Mask) or Extension_Byte_Base));
is (UTF_8_Character'Val ((Interfaces.Shift_Right (P, 6 * R) and Extension_Byte_Mask)
or Extension_Byte_Base))
with
Pre => P not in 0 .. 16#7F# and R < 3;
function Initial_Mask (L : Positive) return Code_Point
is (Shift_Left (1, 6 - L) - 1);
@ -45,7 +48,10 @@ private
P : Code_Point;
L : Positive) -- Extension bytes
return UTF_8_Character
is (UTF_8_Character'Val (Shift_Right (P, 6 * L) or Initial_Base (L)));
is (UTF_8_Character'Val (Shift_Right (P, 6 * L) or Initial_Base (L)))
with
Pre => P not in 0 .. 16#7F# and L < 4,
Post => Make_Initial_Byte'Result in Initial_2_Byte | Initial_3_Byte | Initial_4_Byte;
-- Masking is not needed here
end Encodings.Unicode.UTF_8;

View file

@ -10,6 +10,9 @@ package Encodings.Unicode with Pure is
private
subtype Code_Point is Interfaces.Unsigned_32 range 0 .. Code_Point_Last;
subtype Basic_Multilingual_Plane is Code_Point
range 0 .. 16#FFFF#;
subtype Sequence_Count is Integer range 0 .. 4;
end Encodings.Unicode;

View file

@ -113,12 +113,16 @@ package body Encodings.Unicode.Converter_Test is
Hex'Pos (H (H'Last)) + 16 * UC (H (H'First .. H'Last - 1)));
function U (H : Hex_String) return UTF_8.UTF_8_Character is (UTF_8.UTF_8_Character'Val (UC (H)));
function U (H : Hex_String) return UTF_16.UTF_16_Character is (UTF_16.UTF_16_Character'Val (UC (H)));
function U (H : Hex_String) return UTF_32.UTF_32_Character is (UTF_32.UTF_32_Character'Val (UC (H)));
procedure Run_Test (T : in out Test) is
begin
Test_One (T, "", "", "");
Test_One (T, "abc", "abc", "abc");
Test_One (T, U("D0") & U("A9"), "Щ", "Щ");
Test_One (T, U ("D0") & U ("A9"), "Щ", "Щ");
Test_One (T, U ("E2") & U ("89") & U ("A0"), "≠", "≠");
Test_One (T, U ("F0") & U ("9F") & U ("90") & U ("B1"), U ("D83D") & U ("DC31"), "" & U ("01F431")); -- cat face
end Run_Test;
function Suite return Access_Test_Suite is