aboutsummaryrefslogtreecommitdiff
path: root/archived/utf8toASCII.c
diff options
context:
space:
mode:
authorRaymaekers Luca <luca@spacehb.net>2025-04-27 12:52:06 +0200
committerRaymaekers Luca <luca@spacehb.net>2025-04-27 13:05:34 +0200
commitf87f7b4f0aaccc65d03ccee5bb11915ead6fb0e1 (patch)
treed54df0bfde3dbffa02b1f138af4f12456f261e54 /archived/utf8toASCII.c
parent0574f5a7c5159a2ae1d7d2182cec982509947db9 (diff)
First pass at preparing for Github
Diffstat (limited to 'archived/utf8toASCII.c')
-rw-r--r--archived/utf8toASCII.c163
1 files changed, 0 insertions, 163 deletions
diff --git a/archived/utf8toASCII.c b/archived/utf8toASCII.c
deleted file mode 100644
index 988a69b..0000000
--- a/archived/utf8toASCII.c
+++ /dev/null
@@ -1,163 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdio.h>
-#include <locale.h>
-#include <assert.h>
-#include <wchar.h>
-#include <stdint.h>
-#include <string.h>
-
-// Short aliases for the <stdint.h> fixed-width integers.
-// Prefix: u = unsigned, s = signed. Suffix: width in bits.
-typedef int8_t   s8;
-typedef uint8_t  u8;
-typedef int16_t  s16;
-typedef uint16_t u16;
-typedef int32_t  s32;
-typedef uint32_t u32;
-typedef int64_t  s64;
-typedef uint64_t u64;
-
-/*
- * Packs a wide-character buffer into a byte stream of run pairs.
- *
- * Each pair is laid out as:
- *   [1 byte: N] [N bytes: characters whose value fits in one byte]
- *   [1 byte: M] [M * sizeof(wchar_t) bytes: raw wide characters]
- * Either run of a pair may be empty; a run never exceeds 255 characters
- * because its length is stored in a single byte.
- *
- * InSize   size of In in bytes; only whole wchar_t elements are consumed.
- * In       characters to compress. No terminator is required and none is
- *          read: scanning stops strictly at InSize.
- * OutSize  capacity of OutBase in bytes.
- * OutBase  destination buffer.
- *
- * Returns the number of bytes written. Returns 0 when the input is empty or
- * when OutBase is too small to hold the complete stream (the buffer contents
- * are then unspecified).
- */
-static size_t
-UTF8Compress(size_t InSize, wchar_t* In, size_t OutSize, u8* OutBase)
-{
-    // Longest run a single count byte can describe.
-    enum { MaxRunLength = 255 };
-
-    wchar_t* InEnd = In + InSize / sizeof(wchar_t);
-    u8* Out = OutBase;
-
-    while (In < InEnd)
-    {
-        // Run of characters that survive a round trip through one byte.
-        wchar_t* NarrowRun = In;
-        while (In < InEnd &&
-               (size_t)(In - NarrowRun) < MaxRunLength &&
-               *In == (u8)*In)
-        {
-            In++;
-        }
-        size_t NarrowCount = (size_t)(In - NarrowRun);
-
-        // Run of characters that need the full wchar_t.
-        wchar_t* WideRun = In;
-        while (In < InEnd &&
-               (size_t)(In - WideRun) < MaxRunLength &&
-               *In != (u8)*In)
-        {
-            In++;
-        }
-        size_t WideCount = (size_t)(In - WideRun);
-        size_t WideBytes = WideCount * sizeof(wchar_t);
-
-        // Refuse to write a pair that does not fit entirely.
-        size_t Needed = 2 + NarrowCount + WideBytes;
-        size_t Remaining = OutSize - (size_t)(Out - OutBase);
-        if (Needed > Remaining)
-        {
-            return 0;
-        }
-
-        *Out++ = (u8)NarrowCount;
-        for (size_t Index = 0; Index < NarrowCount; Index++)
-        {
-            *Out++ = (u8)NarrowRun[Index];
-        }
-
-        *Out++ = (u8)WideCount;
-        // Out is generally not aligned for wchar_t, so copy bytes instead of
-        // storing through a wchar_t pointer.
-        memcpy(Out, WideRun, WideBytes);
-        Out += WideBytes;
-    }
-    assert(In == InEnd);
-
-    return (size_t)(Out - OutBase);
-}
-
-/*
- * Writes a human-readable dump of a compressed stream to stdout, one
- * `<N>A("...") <M>U("...") ` group per run pair, followed by a newline.
- *
- * In      compressed bytes: repeated [count][1-byte characters]
- *         [count][raw wchar_t characters] pairs.
- * InSize  number of bytes in In.
- *
- * A stream whose run lengths point past InSize is not read beyond the
- * buffer; dumping stops at the damaged pair and the final assert fires.
- */
-static void
-PrintCompressedUTF8(u8* In, size_t InSize)
-{
-    u8* InEnd = In + InSize;
-    int Truncated = 0;
-
-    while (In < InEnd)
-    {
-        u8 ASCIICount = *In++;
-        if ((size_t)(InEnd - In) < ASCIICount)
-        {
-            Truncated = 1;
-            break;
-        }
-        wprintf(L"%dA(\"", ASCIICount);
-        while (ASCIICount--)
-        {
-            wprintf(L"%c", *In++);
-        }
-        wprintf(L"\") ");
-
-        // The count byte of the wide run must itself be inside the buffer.
-        if (In == InEnd)
-        {
-            Truncated = 1;
-            break;
-        }
-        u8 UTF8Count = *In++;
-        if ((size_t)(InEnd - In) < UTF8Count * sizeof(wchar_t))
-        {
-            Truncated = 1;
-            break;
-        }
-        wprintf(L"%dU(\"", UTF8Count);
-        while (UTF8Count--)
-        {
-            // The stream is byte-packed, so In is generally misaligned for
-            // wchar_t; copy the bytes out instead of dereferencing a cast.
-            wchar_t Wide;
-            memcpy(&Wide, In, sizeof(Wide));
-            wprintf(L"%lc", (wint_t)Wide);   // %lc expects a wint_t argument
-            In += sizeof(Wide);
-        }
-        wprintf(L"\") ");
-    }
-    wprintf(L"\n");
-
-    (void)Truncated;   // only read by the assert below
-    assert(!Truncated && In == InEnd);
-}
-
-/*
- * Expands a compressed stream back into wide characters.
- *
- * InSize   number of bytes in In.
- * In       compressed bytes: repeated [count][1-byte characters]
- *          [count][raw wchar_t characters] pairs.
- * OutSize  capacity of Out in bytes.
- * Out      destination buffer. No terminator is appended; the caller must
- *          add one if it needs a C wide string.
- *
- * Decoding stops at the first pair that would read past InSize or write past
- * OutSize, so neither buffer is overrun; the final assert then fires.
- */
-static void
-UTF8Decompress(size_t InSize, u8* In, size_t OutSize, wchar_t* Out)
-{
-    u8* InEnd = In + InSize;
-    wchar_t* OutEnd = Out + OutSize / sizeof(wchar_t);
-    int Malformed = 0;
-
-    while (In < InEnd)
-    {
-        size_t ASCIICount = *In++;
-        if (ASCIICount > (size_t)(InEnd - In) ||
-            ASCIICount > (size_t)(OutEnd - Out))
-        {
-            Malformed = 1;
-            break;
-        }
-        while (ASCIICount--)
-        {
-            *Out++ = *In++;
-        }
-
-        // The count byte of the wide run must itself be inside the buffer.
-        if (In == InEnd)
-        {
-            Malformed = 1;
-            break;
-        }
-        size_t UTF8Count = *In++;
-        size_t UTF8Bytes = UTF8Count * sizeof(wchar_t);
-        if (UTF8Bytes > (size_t)(InEnd - In) ||
-            UTF8Count > (size_t)(OutEnd - Out))
-        {
-            Malformed = 1;
-            break;
-        }
-        // The stream is byte-packed, so In is generally misaligned for
-        // wchar_t; copy bytes instead of loading through a wchar_t pointer.
-        memcpy(Out, In, UTF8Bytes);
-        Out += UTF8Count;
-        In += UTF8Bytes;
-    }
-
-    (void)Malformed;   // only read by the assert below
-    assert(!Malformed && In == InEnd);
-}
-
-// Returns an output buffer size, in bytes, that is large enough to compress
-// any wide-character input of Size bytes (L"aaa" is 3 * sizeof(wchar_t) bytes).
-//
-// The stream is a sequence of pairs, each costing two count bytes. A
-// single-byte character shrinks from sizeof(wchar_t) bytes to one, so only a
-// pair with an empty single-byte run makes the output larger than the input.
-// Such a pair is either the very first one or directly follows a full
-// 255-character wide run, so there is at most one per started block of 255
-// characters. The worst case is therefore Size + 2 bytes per started block.
-size_t
-UTF8GetMaximumCompressedSize(size_t Size)
-{
-    size_t CharCount = (Size + sizeof(wchar_t) - 1) / sizeof(wchar_t);
-    size_t BlockCount = (CharCount + 254) / 255;
-
-    // Always reserve room for one pair, as earlier versions did for any Size.
-    if (BlockCount == 0)
-    {
-        BlockCount = 1;
-    }
-
-    return Size + 2 * BlockCount;
-}
-
-// Demo driver: compresses a fixed wide string, reports the sizes on stderr,
-// decompresses it again, and dumps the compressed stream to stdout.
-// Returns 0 on success and EXIT_FAILURE if the locale or the allocation fails.
-int
-main(int Argc, char* Argv[]) {
-    (void)Argc;
-    (void)Argv;
-
-    // Kept outside assert() so the call still happens when NDEBUG is defined.
-    if (setlocale(LC_ALL, "") == NULL)
-    {
-        fwprintf(stderr, L"Failed to set the locale.\n");
-        return EXIT_FAILURE;
-    }
-
-    wchar_t* InBuf = L"text│tt│";
-    size_t InLength = wcslen(InBuf);
-    size_t InSize = InLength * sizeof(wchar_t);
-
-    size_t OutSize = UTF8GetMaximumCompressedSize(InSize);
-    u8 OutBuf[OutSize];
-
-    size_t CompressedSize = UTF8Compress(InSize, InBuf, OutSize, OutBuf);
-
-    fwprintf(stderr, L"Raw string: \"%ls\"\n", InBuf);
-    // %zu is the conversion for size_t; %lu is only correct where size_t
-    // happens to be unsigned long.
-    fwprintf(stderr, L"Compressed %zu bytes -> %zu bytes.\n", InSize, CompressedSize);
-
-    // One extra zeroed element keeps the buffer terminated for %ls below,
-    // because the decompressor writes characters only.
-    size_t DecompressedSize = InSize;
-    wchar_t* DecompressedBuffer = calloc(InLength + 1, sizeof(*DecompressedBuffer));
-    if (DecompressedBuffer == NULL)
-    {
-        fwprintf(stderr, L"Out of memory.\n");
-        return EXIT_FAILURE;
-    }
-
-    UTF8Decompress(CompressedSize, OutBuf, DecompressedSize, DecompressedBuffer);
-    fwprintf(stderr, L"Decompressed: \"%ls\"\n", DecompressedBuffer);
-
-    PrintCompressedUTF8(OutBuf, CompressedSize);
-
-    free(DecompressedBuffer);
-    return 0;
-}