diff options
| author | Raymaekers Luca <raymaekers.luca@gmail.com> | 2024-11-23 03:34:03 +0100 | 
|---|---|---|
| committer | Raymaekers Luca <raymaekers.luca@gmail.com> | 2024-11-23 03:34:03 +0100 | 
| commit | 3060303b36dc23f08d923a6bcb15129ff7602864 (patch) | |
| tree | 229095bf8ba575d88c6dd0315b8ba2d4803cea2d | |
| parent | d01caf679f6f819a501687278d75be110646d6ed (diff) | |
checkpoint
| -rw-r--r-- | utf8toASCII.c | 163 | ||||
| -rw-r--r-- | wrap.c | 52 | 
2 files changed, 215 insertions, 0 deletions
/* ===== utf8toASCII.c ===== */
#include <assert.h>
#include <locale.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>

typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
typedef int8_t s8;
typedef int16_t s16;
typedef int32_t s32;
typedef int64_t s64;

// Compressed stream layout: a sequence of pairs
//
//     [NarrowCount : 1 byte][NarrowCount bytes]
//     [WideCount   : 1 byte][WideCount * sizeof(wchar_t) bytes]
//
// "Narrow" ("ASCII" in the function names) means any character whose value
// fits in one byte (0..255); everything else is stored as a raw wchar_t in
// native byte order. Run lengths are limited by the one-byte count prefix.
#define MAX_LITERAL_COUNT 255

// True when Char survives a round trip through u8, i.e. fits in one byte.
static int
IsByteSized(wchar_t Char)
{
    return Char == (wchar_t)(u8)Char;
}

// Compresses the wide string In into OutBase.
//
// InSize  - size of the input in BYTES (a trailing partial character is ignored).
// In      - input characters; a terminator is neither required nor consumed.
// OutSize - capacity of OutBase in bytes.
// OutBase - destination for the compressed stream.
//
// Returns the number of bytes written, or 0 when the output does not fit in
// OutSize (an empty input also yields 0, with nothing written).
static size_t
UTF8Compress(size_t InSize, wchar_t* In, size_t OutSize, u8* OutBase)
{
    const wchar_t* Cursor = In;
    const wchar_t* InEnd = In + InSize / sizeof(wchar_t);
    size_t Written = 0;

    while (Cursor < InEnd)
    {
        // Measure the run of byte-sized characters. The Cursor < InEnd check
        // keeps the scan from running into (and past) the terminator.
        const wchar_t* NarrowRun = Cursor;
        size_t NarrowCount = 0;
        while (Cursor < InEnd &&
               NarrowCount < MAX_LITERAL_COUNT &&
               IsByteSized(*Cursor))
        {
            Cursor++;
            NarrowCount++;
        }

        // Measure the run of wide characters that follows it.
        const wchar_t* WideRun = Cursor;
        size_t WideCount = 0;
        while (Cursor < InEnd &&
               WideCount < MAX_LITERAL_COUNT &&
               !IsByteSized(*Cursor))
        {
            Cursor++;
            WideCount++;
        }

        // Refuse to write past the caller's buffer.
        size_t WideBytes = WideCount * sizeof(wchar_t);
        size_t Needed = 2 + NarrowCount + WideBytes;
        if (OutSize - Written < Needed)
        {
            return 0;
        }

        // Encode the narrow/wide pair.
        OutBase[Written++] = (u8)NarrowCount;
        for (size_t Index = 0; Index < NarrowCount; Index++)
        {
            OutBase[Written++] = (u8)NarrowRun[Index];
        }

        OutBase[Written++] = (u8)WideCount;
        // memcpy because the destination is generally not aligned for wchar_t.
        memcpy(OutBase + Written, WideRun, WideBytes);
        Written += WideBytes;
    }

    return Written;
}

// Dumps a compressed stream to stdout in the form
//     <n>A("...") <m>U("...") ...
// one A/U group per stored pair, followed by a newline.
// Printing stops early if the stream is truncated; debug builds then trip the
// final assert.
static void
PrintCompressedUTF8(u8* In, size_t InSize)
{
    size_t Pos = 0;

    while (Pos < InSize)
    {
        size_t NarrowCount = In[Pos++];
        if (InSize - Pos < NarrowCount)
        {
            break;
        }
        wprintf(L"%dA(\"", (int)NarrowCount);
        for (size_t Index = 0; Index < NarrowCount; Index++)
        {
            wprintf(L"%c", In[Pos++]);
        }
        wprintf(L"\") ");

        if (Pos == InSize)
        {
            break;
        }
        size_t WideCount = In[Pos++];
        if (InSize - Pos < WideCount * sizeof(wchar_t))
        {
            break;
        }
        wprintf(L"%dU(\"", (int)WideCount);
        for (size_t Index = 0; Index < WideCount; Index++)
        {
            // The payload is unaligned, so copy it out instead of casting.
            wchar_t Char;
            memcpy(&Char, In + Pos, sizeof Char);
            wprintf(L"%lc", (wint_t)Char);
            Pos += sizeof Char;
        }
        wprintf(L"\") ");
    }
    wprintf(L"\n");

    assert(Pos == InSize);
}

// Expands a compressed stream back into wide characters.
//
// InSize  - size of the compressed stream in bytes.
// In      - compressed stream produced by UTF8Compress.
// OutSize - capacity of Out in BYTES.
// Out     - destination; NO terminator is appended, the caller adds one.
//
// Decoding stops early when the stream is truncated or Out is full; debug
// builds then trip the final assert.
static void
UTF8Decompress(size_t InSize, u8* In, size_t OutSize, wchar_t* Out)
{
    size_t Pos = 0;
    size_t OutCapacity = OutSize / sizeof(wchar_t);
    size_t OutCount = 0;

    while (Pos < InSize)
    {
        size_t NarrowCount = In[Pos++];
        if (InSize - Pos < NarrowCount ||
            OutCapacity - OutCount < NarrowCount)
        {
            break;
        }
        for (size_t Index = 0; Index < NarrowCount; Index++)
        {
            Out[OutCount++] = In[Pos++];
        }

        if (Pos == InSize)
        {
            break;
        }
        size_t WideCount = In[Pos++];
        size_t WideBytes = WideCount * sizeof(wchar_t);
        if (InSize - Pos < WideBytes ||
            OutCapacity - OutCount < WideCount)
        {
            break;
        }
        // The payload is unaligned, so copy it instead of casting.
        memcpy(Out + OutCount, In + Pos, WideBytes);
        OutCount += WideCount;
        Pos += WideBytes;
    }

    assert(Pos == InSize);
}

// Size is the size of the wide string in bytes, without the terminator
// ("aaa" is 3 * sizeof(wchar_t)).
//
// Every stored pair holds at least one character and costs two count bytes,
// and no character takes more than sizeof(wchar_t) bytes, so the stream never
// exceeds Size + 2 * CharacterCount. The extra 2 keeps the result from
// dropping below the previous "Size + 2" bound for tiny inputs.
size_t
UTF8GetMaximumCompressedSize(size_t Size)
{
    size_t CharCount = Size / sizeof(wchar_t);
    return Size + 2 * CharCount + 2;
}

// Round-trips a sample string through compress/decompress and prints the
// raw string, the sizes, the decompressed string and a dump of the stream.
int
main(int Argc, char* Argv[])
{
    (void)Argc;
    (void)Argv;

    // Not wrapped in assert(): with NDEBUG the call itself would disappear.
    // Diagnostics use fwprintf only, so stderr stays wide-oriented.
    if (setlocale(LC_ALL, "") == NULL)
    {
        fwprintf(stderr, L"Could not apply the environment's locale.\n");
        return EXIT_FAILURE;
    }

    wchar_t* InBuf = L"text│tt│";
    size_t InCount = wcslen(InBuf);
    size_t InSize = InCount * sizeof(wchar_t);

    size_t OutSize = UTF8GetMaximumCompressedSize(InSize);
    u8* OutBuf = malloc(OutSize);

    // One extra zeroed element so the decompressed text is terminated for %ls.
    size_t DecompressedSize = InSize;
    wchar_t* DecompressedBuffer = calloc(InCount + 1, sizeof *DecompressedBuffer);

    if (OutBuf == NULL || DecompressedBuffer == NULL)
    {
        fwprintf(stderr, L"Out of memory.\n");
        free(DecompressedBuffer);
        free(OutBuf);
        return EXIT_FAILURE;
    }

    size_t CompressedSize = UTF8Compress(InSize, InBuf, OutSize, OutBuf);
    if (CompressedSize == 0 && InSize != 0)
    {
        fwprintf(stderr, L"Compressed output does not fit in %zu bytes.\n", OutSize);
        free(DecompressedBuffer);
        free(OutBuf);
        return EXIT_FAILURE;
    }

    fwprintf(stderr, L"Raw string: \"%ls\"\n", InBuf);
    fwprintf(stderr, L"Compressed %zu bytes -> %zu bytes.\n", InSize, CompressedSize);

    UTF8Decompress(CompressedSize, OutBuf, DecompressedSize, DecompressedBuffer);
    fwprintf(stderr, L"Decompressed: \"%ls\"\n", DecompressedBuffer);

    PrintCompressedUTF8(OutBuf, CompressedSize);

    free(DecompressedBuffer);
    free(OutBuf);
    return 0;
}

/* ===== wrap.c ===== */
// Word-wraps Text and prints it with tb_printf, one wrapped line per row
// starting at row 0.
//
// 1. Search backwards from the line limit for whitespace
//    - found?  y) break the line there
//              n) hard-break at the limit
//    - end?    y) print the remainder and stop
//              n) goto 1. starting after the break
//
// Text   - text to wrap; temporarily modified while printing and restored.
//          NOTE(review): the final remainder is printed with "%s", so Text is
//          presumably NUL-terminated at Text[Len] - confirm against callers.
// Len    - number of bytes in Text.
// XLimit - maximum characters per line; 0 prints nothing.
// YLimit - maximum number of rows; printing stops once they are used up.
//
// is_whitespace() and tb_printf() are declared elsewhere in the project.
void
wrap(u8* Text, u32 Len, u32 XLimit, u32 YLimit)
{
    u32 LineStart = 0;
    u32 Y = 0;

    // A zero-width or zero-height area cannot hold any text.
    if (XLimit == 0 || YLimit == 0)
    {
        return;
    }

    // Written as a subtraction so LineStart + XLimit cannot overflow.
    while (Len - LineStart > XLimit)
    {
        // Search backwards for whitespace to break on.
        u32 Break = LineStart + XLimit;
        while (Break > LineStart && !is_whitespace(Text[Break]))
        {
            Break--;
        }

        // No whitespace on this line: hard-break at the limit, measured from
        // the start of the current line rather than from the start of Text.
        if (Break == LineStart)
        {
            Break = LineStart + XLimit;
        }

        // Temporarily terminate the line so it can be printed with "%s".
        u8 Saved = Text[Break];
        Text[Break] = '\0';
        tb_printf(0, Y, 0, 0, "%s", (char*)(Text + LineStart));
        Text[Break] = Saved;

        Y++;
        if (Y >= YLimit)
        {
            // Out of rows: do not print anything below the limit.
            return;
        }

        // Consume the whitespace the line was broken on, staying inside Text.
        while (Break < Len && is_whitespace(Text[Break]))
        {
            Break++;
        }
        LineStart = Break;
    }

    // Whatever is left fits on one line.
    if (LineStart < Len)
    {
        tb_printf(0, Y, 0, 0, "%s", (char*)(Text + LineStart));
    }
}
