main.cpp

/*
 * File:   main.cpp
 * Author: Saleem Edah-Tally - nmset@yandex.com
 * License: CeCILL-C
 * Copyright: Saleem Edah-Tally - © 2023
 *
 * Created on 20 september 2023, 18:31
 */

#include <iostream>
#include <getopt.h>
#include <utf8proc.h>
#include <format>
#include <map>

using namespace std;

#define STRIP_OPTIONS_DEFAULT (UTF8PROC_IGNORE | UTF8PROC_STRIPCC | UTF8PROC_STRIPMARK | UTF8PROC_STRIPNA | UTF8PROC_DECOMPOSE | UTF8PROC_STABLE | UTF8PROC_NULLTERM)

typedef map<int, string> KeyValuePair;
// Described in utf8proc.h.
KeyValuePair categoryDescription;
KeyValuePair bidirectional;
KeyValuePair decompositionType;
KeyValuePair boundClass;

string valueRepresentation(long nb, int baseHint) {
    // https://en.cppreference.com/w/cpp/utility/format/formatter
    string formatted;
    switch (baseHint)
    {
        case 2:
            formatted = std::format("{:08b}", nb);
            break;
        case 8:
            formatted = std::format("{}{:03o}", "\\", nb);
            break;
        case 10:
            formatted = std::format("{:d}", nb);
            break;
        case 16:
            formatted = std::format("{}{:0X}","0x", nb);
            break;
        case 17: // enforce 4-byte representation for UTF-16, even if codepoint < 0xFFFF.
            formatted = std::format("{}{:04X}","0x", nb);
            break;
        case 'U':
            formatted = std::format("{}{:04X}","U+", nb);
            break;
        case 'x':
            formatted = std::format("{}{:d}{}","&#", nb, ";");
            break;
        default:
            cout << "Unhandled base: " << baseHint << endl;
            return "";
    }
    
    return formatted;
}

void unaccentShowHelp()
{
    string message("This operational mode removes character markings, control characters, default ignorable characters and unassigned codepoints from an UTF-8 input."
    "The utf8proc library, on which this utility is based, refers to character markings as 'non-spacing, spacing and enclosing (accents)' marks, and to default ignorable characters 'such as SOFT-HYPHEN or ZERO-WIDTH-SPACE'. Control characters are stripped or converted to spaces."
    "\n\nBy default, every removable byte is stripped, the output characters are decomposed and Unicode Versioning Stability is enforced."
    "\n\n  -i, --ignore: do not strip 'default ignorable characters'"
    "\n  -c, --control: do not handle 'control characters'"
    "\n  -m, --mark: do not strip 'character markings'"
    "\n  -n, --na: do not strip 'unassigned codepoints'"
    "\n  -r, --recompose: output recomposed characters"
    "\n  -h, --help: show this message"
    "\n\nThe input can be piped in or read from stdin. It must be a single NULL terminated line.");
    
    cout << message << endl;
}

void normalizeShowHelp()
{
    string message("This operational mode normalizes the input string according to the specified type, the default being NFC."
    "\n\n  -t, --type: one of NFC, NFD, NFKC, NFKD, NFKC_Casefold"
    "\n  -h, --help: show this message"
    "\n\nThe input can be piped in or read from stdin. It must be a single NULL terminated line.");
    
    cout << message << endl;
}

void representationShowHelp()
{
    string message("This operational mode displays representations of the first identified codepoint."
    "\n\n  -p, --codepoint: hexadecimal representation of the codepoint"
    "\n  -e, --utf8: hexadecimal representation of each byte"
    "\n  -s, --utf16: hexadecimal representation of each surrogate"
    "\n  -b, --binary: binary representation of each byte"
    "\n  -o, --octal: octal representation of each byte"
    "\n  -d, --decimal: decimal representation of each byte"
    "\n  -x, --xml: XML decimal representation of each byte"
    "\n  -L, --tolower: displays the codepoint as a lower-case character if existent"
    "\n  -U, --toupper: displays the codepoint as an upper-case character if existent"
    "\n  -T, --totitle: displays the codepoint as a title-case character if existent"
    "\n  -h, --help: show this message"
    "\n\nThe input can be piped in or read from stdin. Pass in a single character for simplicity.");
    
    cout << message << endl;
}

int unaccent(int argc, char **argv) {
    string input;
    int options  = STRIP_OPTIONS_DEFAULT;
    
    // Use : --longopt=<val> -s <val>
    option longopts[] = {
        {"ignore", no_argument, 0, 'i'}, 
        {"control", no_argument, 0, 'c'}, 
        {"mark", no_argument, 0, 'm'},
        {"na", no_argument, 0, 'n'},
        {"recompose", no_argument, 0, 'r'},
        {"help", no_argument, 0, 'h'},
        {0}};
        
    while (1) {
        const int opt = getopt_long(argc, argv, "icmnrh", longopts, 0);
        
        if (opt == -1) {
            break;
        }
        
        switch (opt) {
            case 'i':
                options ^= UTF8PROC_IGNORE;
                break;
            case 'c':
                options ^= UTF8PROC_STRIPCC;
                break;
            case 'm':
                options ^= UTF8PROC_STRIPMARK;
                break;
            case 'n':
                options ^= UTF8PROC_STRIPNA;
                break;
            case 'r':
                // Interestingly, UTF8PROC_COMPOSE gives the same result as UTF8PROC_DECOMPOSE.
                // Probably the options mean : decompose, strip, compose ?
                options ^= UTF8PROC_DECOMPOSE;
                options |= UTF8PROC_COMPOSE;
                break;
            case 'h':
                unaccentShowHelp();
                return 0;
            case '?':
                return 30; // -5 to -1 are reserved by utf8proc; their absolute values are used here.
            default:
                options  = STRIP_OPTIONS_DEFAULT;
                break;
        }
    }
    
    //string fragment;
    // while (cin >> fragment)
    //     input += fragment + " ";
    // input.pop_back();
    std::getline(cin, input);
    utf8proc_uint8_t * result;
    utf8proc_ssize_t nb = utf8proc_map((const utf8proc_uint8_t *) input.data(),
                                        0, // Without UTF8PROC_NULLTERM, is number of bytes to process from input.
                                        &result,
                                        utf8proc_option_t (options)
    );
    if (nb < 0) // an error occured
    {
        cout << utf8proc_errmsg(nb) << endl;
        if (result)
            free((void*) result);
        return nb * -1;
    }
    
    cout << (const char*) result << endl;
    free((void*) result);
    
    return 0;
}

int normalize(int argc, char ** argv)
{
    string input;
    string type("NFC");
    
    // Use : --longopt=<val> -s <val>
    option longopts[] = {
        {"type", required_argument, 0, 't'}, 
        {"help", no_argument, 0, 'h'},
        {0}};
    
    while (1) {
        const int opt = getopt_long(argc, argv, "t:h", longopts, 0);
        
        if (opt == -1) {
            break;
        }
        
        switch (opt) {
            case 't':
                type = optarg;
                break;
            case 'h':
                normalizeShowHelp();
                return 0;
            case '?':
                return 40; // -5 to -1 are reserved by utf8proc; their absolute values are used here.
            default:
                type = "NFC";
                break;
        }
    }
    
    std::getline(cin, input);
    utf8proc_uint8_t * result = NULL;
    
    if (type == "NFC")
    {
        result = utf8proc_NFC((const utf8proc_uint8_t*) input.c_str());
    }
    else if (type == "NFD")
    {
        result = utf8proc_NFD((const utf8proc_uint8_t*) input.c_str());
    }
    else if (type == "NFKC")
    {
        result = utf8proc_NFKC((const utf8proc_uint8_t*) input.c_str());
    }
    else if (type == "NFKD")
    {
        result = utf8proc_NFKD((const utf8proc_uint8_t*) input.c_str());
    }
    else if (type == "NFKC_Casefold")
    {
        result = utf8proc_NFKC_Casefold((const utf8proc_uint8_t*) input.c_str());
    }
    else
    {
        cout << "Unknown type; valid types are NFC, NFD, NFKC, NFKD and NFKC_Casefold." << endl;
        return 41;
    }
    
    if (result)
    {
        cout << (const char*) result << endl;
        free((void*) result);
    }
    
    return 0;
}

int representation(int argc, char ** argv)
{
    for (uint i = 0; i < argc; i++)
    {
        string arg = argv[i];
        if (arg == "-h" || arg == "--help")
        {
            representationShowHelp();
            return 0;
        }
    }
    
    utf8proc_int32_t codepoint = 0;
    string input;
    cin >> input;
    const utf8proc_uint8_t * inputArray = (const utf8proc_uint8_t *) input.c_str();
    // This stops at the first codepoint; with 'عَ', the first retrieved codepoint is 'ع'.
    utf8proc_ssize_t nb = utf8proc_iterate(&inputArray[0], -1, &codepoint);
    if (nb < 0)
    {
        cout << utf8proc_errmsg(nb) << endl;
        return nb * -1;
    }
    utf8proc_uint8_t firstCharArray[5];
    utf8proc_ssize_t nbOfBytesInFirstChar = utf8proc_encode_char(codepoint, firstCharArray);
    if (nbOfBytesInFirstChar == 0)
    {
        cout << "No valid bytes at start of input." << endl;
        return 51;
    }
    firstCharArray[nbOfBytesInFirstChar] = '\0';
    
    option longopts[] = {
        {"codepoint", no_argument, 0, 'p'}, 
        {"utf8", no_argument, 0, 'e'},  // 'e'ight
        {"utf16", no_argument, 0, 's'}, // 's'ixteen
        {"binary", no_argument, 0, 'b'},
        {"octal", no_argument, 0, 'o'},
        {"decimal", no_argument, 0, 'd'},
        {"xml", no_argument, 0, 'x'},
        {"tolower", no_argument, 0, 'L'},
        {"toupper", no_argument, 0, 'U'},
        {"totitle", no_argument, 0, 'T'},
        {0}};
        
    while (1) {
        const int opt = getopt_long(argc, argv, "pesbodxLUT", longopts, 0);
        
        if (opt == -1) {
            break;
        }
        
        switch (opt) {
            case 'p':
                cout << "Codepoint: " << valueRepresentation(codepoint, 'U') << endl;
                break;
            case 'e':
                cout << "UTF-8: ";
                for (uint i = 0; i < nbOfBytesInFirstChar; i++)
                {
                    cout << valueRepresentation(firstCharArray[i], 16) << " ";
                }
                cout << endl;
                break;
            case 's':
                // https://en.wikipedia.org/wiki/UTF-16
                cout << "UTF-16: ";
                if (codepoint < 0xFFFF) // 2 bytes only
                {
                    cout << valueRepresentation(codepoint, 17);
                }
                else // 4 bytes
                {
                    utf8proc_int32_t intermediate = codepoint - 0x10000;
                    utf8proc_int32_t shifted = intermediate >> 10; // Divide by 0x400 (1024)(2^10)
                    utf8proc_int32_t highSurrogate = shifted +  0xD800;
                    utf8proc_int32_t lowTenBits = intermediate % 0x400; // Same result with (intermediate & 1023)
                    utf8proc_int32_t lowSurrogate = lowTenBits + 0xDC00;
                    cout << valueRepresentation(highSurrogate, 17) << " ";
                    cout << valueRepresentation(lowSurrogate, 17);
                }
                cout << endl;
                break;
            case 'b':
                cout << "Binary: ";
                for (uint i = 0; i < nbOfBytesInFirstChar; i++)
                {
                    cout << valueRepresentation(firstCharArray[i], 2) << " ";
                }
                cout << endl;
                break;
            case 'o':
                cout << "Octal: ";
                for (uint i = 0; i < nbOfBytesInFirstChar; i++)
                {
                    cout << valueRepresentation(firstCharArray[i], 8) << " ";
                }
                cout << endl;
                break;
            case 'd':
                cout << "Decimal: ";
                for (uint i = 0; i < nbOfBytesInFirstChar; i++)
                {
                    cout << valueRepresentation(firstCharArray[i], 10) << " ";
                }
                cout << endl;
                break;
            case 'x':
                cout << "XML decimal: " << valueRepresentation(codepoint, 'x') << endl;
                break;
            case 'L':
            {
                utf8proc_int32_t lowerCodepoint = utf8proc_tolower(codepoint);
                utf8proc_uint8_t dst[5];
                utf8proc_ssize_t bytesWritten = utf8proc_encode_char(lowerCodepoint, &dst[0]);
                dst[bytesWritten] = '\0';
                cout << "To lower: " << (const char*) dst << endl;
            }
                break;
            case 'U':
            {
                utf8proc_int32_t upperCodepoint = utf8proc_toupper(codepoint);
                utf8proc_uint8_t dst[5];
                utf8proc_ssize_t bytesWritten = utf8proc_encode_char( upperCodepoint, &dst[0]);
                dst[bytesWritten] = '\0';
                cout << "To upper: " << (const char*) dst << endl;
            }
                break;
            case 'T':
            {
                utf8proc_int32_t upperCodepoint = utf8proc_totitle(codepoint);
                utf8proc_uint8_t dst[5];
                utf8proc_ssize_t bytesWritten = utf8proc_encode_char( upperCodepoint, &dst[0]);
                dst[bytesWritten] = '\0';
                cout << "To title: " << (const char*) dst << endl;
            }
                break;
            case 'h':
                representationShowHelp();
                return 0;
            case '?':
                return 50; // -5 to -1 are reserved by utf8proc; their absolute values are used here.
            default:
                break;
        }
    }
    // Show the processed character.
    cout << "Character: " << (const char*) firstCharArray << endl;
    return 0;
}

int main(int argc, char ** argv)
{
    categoryDescription[UTF8PROC_CATEGORY_CN] = "Other, not assigned";
    categoryDescription[UTF8PROC_CATEGORY_LU] = "Letter, uppercase";
    categoryDescription[UTF8PROC_CATEGORY_LL] = "Letter, lowercase";
    categoryDescription[UTF8PROC_CATEGORY_LT] = "Letter, titlecase";
    categoryDescription[UTF8PROC_CATEGORY_LM] = "Letter, modifier";
    categoryDescription[UTF8PROC_CATEGORY_LO] = "Letter, other";
    categoryDescription[UTF8PROC_CATEGORY_MN] = "Mark, nonspacing";
    categoryDescription[UTF8PROC_CATEGORY_MC] = "Mark, spacing combining";
    categoryDescription[UTF8PROC_CATEGORY_ME] = "Mark, enclosing";
    categoryDescription[UTF8PROC_CATEGORY_NL] = "Number, letter";
    categoryDescription[UTF8PROC_CATEGORY_NO] = "Number, other";
    categoryDescription[UTF8PROC_CATEGORY_PC] = "Punctuation, connector";
    categoryDescription[UTF8PROC_CATEGORY_PD] = "Punctuation, dash";
    categoryDescription[UTF8PROC_CATEGORY_PS] = "Punctuation, open";
    categoryDescription[UTF8PROC_CATEGORY_PE] = "Punctuation, close";
    categoryDescription[UTF8PROC_CATEGORY_PI] = "Punctuation, initial quote";
    categoryDescription[UTF8PROC_CATEGORY_PF] = "Punctuation, final quote";
    categoryDescription[UTF8PROC_CATEGORY_PO] = "Punctuation, other";
    categoryDescription[UTF8PROC_CATEGORY_SM] = "Symbol, math";
    categoryDescription[UTF8PROC_CATEGORY_SC] = "Symbol, currency";
    categoryDescription[UTF8PROC_CATEGORY_SK] = "Symbol, modifier";
    categoryDescription[UTF8PROC_CATEGORY_SO] = "Symbol, other";
    categoryDescription[UTF8PROC_CATEGORY_ZS] = "Separator, space";
    categoryDescription[UTF8PROC_CATEGORY_ZL] = "Separator, line";
    categoryDescription[UTF8PROC_CATEGORY_ZP] = "Separator, paragraph";
    categoryDescription[UTF8PROC_CATEGORY_CC] = "Other, control";
    categoryDescription[UTF8PROC_CATEGORY_CF] = "Other, format";
    categoryDescription[UTF8PROC_CATEGORY_CS] = "Other, surrogate";
    categoryDescription[UTF8PROC_CATEGORY_CO] = "Other, private use";
    
    bidirectional[UTF8PROC_BIDI_CLASS_L] = "Left-to-Right";
    bidirectional[UTF8PROC_BIDI_CLASS_LRE] = "Left-to-Right Embedding";
    bidirectional[UTF8PROC_BIDI_CLASS_LRO] = "Left-to-Right Override";
    bidirectional[UTF8PROC_BIDI_CLASS_R] = "Right-to-Left";
    bidirectional[UTF8PROC_BIDI_CLASS_AL] = "Right-to-Left Arabic";
    bidirectional[UTF8PROC_BIDI_CLASS_RLE] = "Right-to-Left Embedding";
    bidirectional[UTF8PROC_BIDI_CLASS_RLO] = "Right-to-Left Override";
    bidirectional[UTF8PROC_BIDI_CLASS_PDF] = "Pop Directional Format";
    bidirectional[UTF8PROC_BIDI_CLASS_EN] = "European Number";
    bidirectional[UTF8PROC_BIDI_CLASS_ES] = "European Separator";
    bidirectional[UTF8PROC_BIDI_CLASS_ET] = "European Number Terminator";
    bidirectional[UTF8PROC_BIDI_CLASS_AN] = "Arabic Number";
    bidirectional[UTF8PROC_BIDI_CLASS_CS] = "Common Number Separator";
    bidirectional[UTF8PROC_BIDI_CLASS_NSM] = "Nonspacing Mark";
    bidirectional[UTF8PROC_BIDI_CLASS_BN] = "Boundary Neutral";
    bidirectional[UTF8PROC_BIDI_CLASS_B] = "Paragraph Separator";
    bidirectional[UTF8PROC_BIDI_CLASS_S] = "Segment Separator";
    bidirectional[UTF8PROC_BIDI_CLASS_WS] = "Whitespace";
    bidirectional[UTF8PROC_BIDI_CLASS_ON] = "Other Neutrals";
    bidirectional[UTF8PROC_BIDI_CLASS_LRI] = "Left-to-Right Isolate";
    bidirectional[UTF8PROC_BIDI_CLASS_RLI] = "Right-to-Left Isolate";
    bidirectional[UTF8PROC_BIDI_CLASS_FSI] = "First Strong Isolate";
    bidirectional[UTF8PROC_BIDI_CLASS_PDI] = "Pop Directional Isolate";
    
    // Whatever it means! But does it concern decomposed form only?
    decompositionType[0] = "Unknown"; // property->decomp_type is 0 on all tested characters, decomposed or not.
    decompositionType[UTF8PROC_DECOMP_TYPE_FONT] = "Font"; // Starts at 1.
    decompositionType[UTF8PROC_DECOMP_TYPE_NOBREAK] = "Nobreak";
    decompositionType[UTF8PROC_DECOMP_TYPE_INITIAL] = "Initial";
    decompositionType[UTF8PROC_DECOMP_TYPE_MEDIAL] = "Medial";
    decompositionType[UTF8PROC_DECOMP_TYPE_FINAL] = "Final";
    decompositionType[UTF8PROC_DECOMP_TYPE_ISOLATED] = "Isolated";
    decompositionType[UTF8PROC_DECOMP_TYPE_CIRCLE] = "Circle";
    decompositionType[UTF8PROC_DECOMP_TYPE_SUPER] = "Super";
    decompositionType[UTF8PROC_DECOMP_TYPE_SUB] = "Sub";
    decompositionType[UTF8PROC_DECOMP_TYPE_VERTICAL] = "Vertical";
    decompositionType[UTF8PROC_DECOMP_TYPE_WIDE] = "Wide";
    decompositionType[UTF8PROC_DECOMP_TYPE_NARROW] = "Narrow";
    decompositionType[UTF8PROC_DECOMP_TYPE_SMALL] = "Small";
    decompositionType[UTF8PROC_DECOMP_TYPE_SQUARE] = "Square";
    decompositionType[UTF8PROC_DECOMP_TYPE_FRACTION] = "Fraction";
    decompositionType[UTF8PROC_DECOMP_TYPE_COMPAT] = "Compat";
    
    // Whatever most values mean!
    boundClass[UTF8PROC_BOUNDCLASS_START] =  "Start";
    boundClass[UTF8PROC_BOUNDCLASS_OTHER] =  "Other";
    boundClass[UTF8PROC_BOUNDCLASS_CR] =  "Cr";
    boundClass[UTF8PROC_BOUNDCLASS_LF] =  "Lf";
    boundClass[UTF8PROC_BOUNDCLASS_CONTROL] =  "Control";
    boundClass[UTF8PROC_BOUNDCLASS_EXTEND] =  "Extend";
    boundClass[UTF8PROC_BOUNDCLASS_L] =  "L";
    boundClass[UTF8PROC_BOUNDCLASS_V] =  "V";
    boundClass[UTF8PROC_BOUNDCLASS_T] =  "T";
    boundClass[UTF8PROC_BOUNDCLASS_LV] =  "Lv";
    boundClass[UTF8PROC_BOUNDCLASS_LVT] = "Lvt";
    boundClass[UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR] = "Regional indicator";
    boundClass[UTF8PROC_BOUNDCLASS_SPACINGMARK] = "Spacingmark";
    boundClass[UTF8PROC_BOUNDCLASS_PREPEND] = "Prepend";
    boundClass[UTF8PROC_BOUNDCLASS_ZWJ] = "Zero Width Joiner";
    boundClass[UTF8PROC_BOUNDCLASS_E_BASE] = "Emoji Base";
    boundClass[UTF8PROC_BOUNDCLASS_E_MODIFIER] = "Emoji Modifier";
    boundClass[UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ] = "Glue_After_ZWJ";
    boundClass[UTF8PROC_BOUNDCLASS_E_BASE_GAZ] = "E_BASE + GLUE_AFTER_ZJW";
    boundClass[UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC] = "Extended_Pictographic",
    boundClass[UTF8PROC_BOUNDCLASS_E_ZWG] = "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC + ZWJ";
    
    const char * modeInfo = "A mode of operation is required: unaccent, normalize, representation."
     "\nPass '--help' for more information in each mode.";
    const int sargc = argc - 1;
    if (sargc == 0)
    {
        cout << modeInfo << endl;
        return 20;
    }
    char * sargv[sargc];
    for (uint i = 0; i < argc; i++)
    {
        if (i < 1)
        {
            sargv[i] = argv[i];
            continue;
        }
        if (i == 1)
            continue;
        sargv[i - 1] = argv[i];
    }
    
    int ret = 0;
    const string mode(argv[1]);
    if (mode == "unaccent")
    {
        ret = unaccent(sargc, sargv);
    }
    else if (mode == "normalize")
    {
        ret = normalize(sargc, sargv);
    }
    else if (mode == "representation")
    {
        ret = representation (sargc, sargv);
    }
    else
    {
        cout << modeInfo << endl;
        return 20;
    }
    
    return ret;
}
Initial commit. Implement unaccent, normalize. 2023-09-30 17:51:15 +02:00			`/*`
			`* File: main.cpp`
			`* Author: Saleem Edah-Tally - nmset@yandex.com`
			`* License: CeCILL-C`
			`* Copyright: Saleem Edah-Tally - © 2023`
			`*`
			`* Created on 20 september 2023, 18:31`
			`*/`

			`#include <iostream>`
			`#include <getopt.h>`
			`#include <utf8proc.h>`
Add representation mode. 2023-10-02 14:21:26 +02:00			`#include <format>`
			`#include <map>`
Initial commit. Implement unaccent, normalize. 2023-09-30 17:51:15 +02:00
			`using namespace std;`

			`#define STRIP_OPTIONS_DEFAULT (UTF8PROC_IGNORE \| UTF8PROC_STRIPCC \| UTF8PROC_STRIPMARK \| UTF8PROC_STRIPNA \| UTF8PROC_DECOMPOSE \| UTF8PROC_STABLE \| UTF8PROC_NULLTERM)`

Add representation mode. 2023-10-02 14:21:26 +02:00			`typedef map<int, string> KeyValuePair;`
			`// Described in utf8proc.h.`
			`KeyValuePair categoryDescription;`
			`KeyValuePair bidirectional;`
			`KeyValuePair decompositionType;`
			`KeyValuePair boundClass;`

			`string valueRepresentation(long nb, int baseHint) {`
			`// https://en.cppreference.com/w/cpp/utility/format/formatter`
			`string formatted;`
			`switch (baseHint)`
			`{`
			`case 2:`
			`formatted = std::format("{:08b}", nb);`
			`break;`
			`case 8:`
			`formatted = std::format("{}{:03o}", "\\", nb);`
			`break;`
			`case 10:`
			`formatted = std::format("{:d}", nb);`
			`break;`
			`case 16:`
			`formatted = std::format("{}{:0X}","0x", nb);`
			`break;`
			`case 17: // enforce 4-byte representation for UTF-16, even if codepoint < 0xFFFF.`
			`formatted = std::format("{}{:04X}","0x", nb);`
			`break;`
			`case 'U':`
			`formatted = std::format("{}{:04X}","U+", nb);`
			`break;`
			`case 'x':`
			`formatted = std::format("{}{:d}{}","&#", nb, ";");`
			`break;`
			`default:`
			`cout << "Unhandled base: " << baseHint << endl;`
			`return "";`
			`}`

			`return formatted;`
			`}`

Initial commit. Implement unaccent, normalize. 2023-09-30 17:51:15 +02:00			`void unaccentShowHelp()`
			`{`
			`string message("This operational mode removes character markings, control characters, default ignorable characters and unassigned codepoints from an UTF-8 input."`
			`"The utf8proc library, on which this utility is based, refers to character markings as 'non-spacing, spacing and enclosing (accents)' marks, and to default ignorable characters 'such as SOFT-HYPHEN or ZERO-WIDTH-SPACE'. Control characters are stripped or converted to spaces."`
			`"\n\nBy default, every removable byte is stripped, the output characters are decomposed and Unicode Versioning Stability is enforced."`
			`"\n\n -i, --ignore: do not strip 'default ignorable characters'"`
			`"\n -c, --control: do not handle 'control characters'"`
			`"\n -m, --mark: do not strip 'character markings'"`
			`"\n -n, --na: do not strip 'unassigned codepoints'"`
			`"\n -r, --recompose: output recomposed characters"`
			`"\n -h, --help: show this message"`
			`"\n\nThe input can be piped in or read from stdin. It must be a single NULL terminated line.");`

			`cout << message << endl;`
			`}`

			`void normalizeShowHelp()`
			`{`
			`string message("This operational mode normalizes the input string according to the specified type, the default being NFC."`
			`"\n\n -t, --type: one of NFC, NFD, NFKC, NFKD, NFKC_Casefold"`
			`"\n -h, --help: show this message"`
			`"\n\nThe input can be piped in or read from stdin. It must be a single NULL terminated line.");`

			`cout << message << endl;`
			`}`

Add representation mode. 2023-10-02 14:21:26 +02:00			`void representationShowHelp()`
			`{`
			`string message("This operational mode displays representations of the first identified codepoint."`
			`"\n\n -p, --codepoint: hexadecimal representation of the codepoint"`
			`"\n -e, --utf8: hexadecimal representation of each byte"`
			`"\n -s, --utf16: hexadecimal representation of each surrogate"`
			`"\n -b, --binary: binary representation of each byte"`
			`"\n -o, --octal: octal representation of each byte"`
			`"\n -d, --decimal: decimal representation of each byte"`
			`"\n -x, --xml: XML decimal representation of each byte"`
			`"\n -L, --tolower: displays the codepoint as a lower-case character if existent"`
			`"\n -U, --toupper: displays the codepoint as an upper-case character if existent"`
			`"\n -T, --totitle: displays the codepoint as a title-case character if existent"`
			`"\n -h, --help: show this message"`
			`"\n\nThe input can be piped in or read from stdin. Pass in a single character for simplicity.");`

			`cout << message << endl;`
			`}`

Initial commit. Implement unaccent, normalize. 2023-09-30 17:51:15 +02:00			`int unaccent(int argc, char **argv) {`
			`string input;`
			`int options = STRIP_OPTIONS_DEFAULT;`

			`// Use : --longopt=<val> -s <val>`
			`option longopts[] = {`
			`{"ignore", no_argument, 0, 'i'},`
			`{"control", no_argument, 0, 'c'},`
			`{"mark", no_argument, 0, 'm'},`
			`{"na", no_argument, 0, 'n'},`
			`{"recompose", no_argument, 0, 'r'},`
			`{"help", no_argument, 0, 'h'},`
			`{0}};`

			`while (1) {`
			`const int opt = getopt_long(argc, argv, "icmnrh", longopts, 0);`

			`if (opt == -1) {`
			`break;`
			`}`

			`switch (opt) {`
			`case 'i':`
			`options ^= UTF8PROC_IGNORE;`
			`break;`
			`case 'c':`
			`options ^= UTF8PROC_STRIPCC;`
			`break;`
			`case 'm':`
			`options ^= UTF8PROC_STRIPMARK;`
			`break;`
			`case 'n':`
			`options ^= UTF8PROC_STRIPNA;`
			`break;`
			`case 'r':`
			`// Interestingly, UTF8PROC_COMPOSE gives the same result as UTF8PROC_DECOMPOSE.`
			`// Probably the options mean : decompose, strip, compose ?`
			`options ^= UTF8PROC_DECOMPOSE;`
			`options \|= UTF8PROC_COMPOSE;`
			`break;`
			`case 'h':`
			`unaccentShowHelp();`
			`return 0;`
			`case '?':`
			`return 30; // -5 to -1 are reserved by utf8proc; their absolute values are used here.`
			`default:`
			`options = STRIP_OPTIONS_DEFAULT;`
			`break;`
			`}`
			`}`

			`//string fragment;`
			`// while (cin >> fragment)`
			`// input += fragment + " ";`
			`// input.pop_back();`
			`std::getline(cin, input);`
			`utf8proc_uint8_t * result;`
			`utf8proc_ssize_t nb = utf8proc_map((const utf8proc_uint8_t *) input.data(),`
			`0, // Without UTF8PROC_NULLTERM, is number of bytes to process from input.`
			`&result,`
			`utf8proc_option_t (options)`
			`);`
			`if (nb < 0) // an error occured`
			`{`
			`cout << utf8proc_errmsg(nb) << endl;`
			`if (result)`
			`free((void*) result);`
			`return nb * -1;`
			`}`

			`cout << (const char*) result << endl;`
			`free((void*) result);`

			`return 0;`
			`}`

			`int normalize(int argc, char ** argv)`
			`{`
			`string input;`
			`string type("NFC");`

			`// Use : --longopt=<val> -s <val>`
			`option longopts[] = {`
			`{"type", required_argument, 0, 't'},`
			`{"help", no_argument, 0, 'h'},`
			`{0}};`

			`while (1) {`
			`const int opt = getopt_long(argc, argv, "t:h", longopts, 0);`

			`if (opt == -1) {`
			`break;`
			`}`

			`switch (opt) {`
			`case 't':`
			`type = optarg;`
			`break;`
			`case 'h':`
			`normalizeShowHelp();`
			`return 0;`
			`case '?':`
			`return 40; // -5 to -1 are reserved by utf8proc; their absolute values are used here.`
			`default:`
			`type = "NFC";`
			`break;`
			`}`
			`}`

			`std::getline(cin, input);`
			`utf8proc_uint8_t * result = NULL;`

			`if (type == "NFC")`
			`{`
			`result = utf8proc_NFC((const utf8proc_uint8_t*) input.c_str());`
			`}`
			`else if (type == "NFD")`
			`{`
			`result = utf8proc_NFD((const utf8proc_uint8_t*) input.c_str());`
			`}`
			`else if (type == "NFKC")`
			`{`
			`result = utf8proc_NFKC((const utf8proc_uint8_t*) input.c_str());`
			`}`
			`else if (type == "NFKD")`
			`{`
			`result = utf8proc_NFKD((const utf8proc_uint8_t*) input.c_str());`
			`}`
			`else if (type == "NFKC_Casefold")`
			`{`
			`result = utf8proc_NFKC_Casefold((const utf8proc_uint8_t*) input.c_str());`
			`}`
			`else`
			`{`
			`cout << "Unknown type; valid types are NFC, NFD, NFKC, NFKD and NFKC_Casefold." << endl;`
			`return 41;`
			`}`

			`if (result)`
			`{`
			`cout << (const char*) result << endl;`
			`free((void*) result);`
			`}`

			`return 0;`
			`}`

Add representation mode. 2023-10-02 14:21:26 +02:00			`int representation(int argc, char ** argv)`
			`{`
			`for (uint i = 0; i < argc; i++)`
			`{`
			`string arg = argv[i];`
			`if (arg == "-h" \|\| arg == "--help")`
			`{`
			`representationShowHelp();`
			`return 0;`
			`}`
			`}`

			`utf8proc_int32_t codepoint = 0;`
			`string input;`
			`cin >> input;`
			`const utf8proc_uint8_t * inputArray = (const utf8proc_uint8_t *) input.c_str();`
			`// This stops at the first codepoint; with 'عَ', the first retrieved codepoint is 'ع'.`
			`utf8proc_ssize_t nb = utf8proc_iterate(&inputArray[0], -1, &codepoint);`
			`if (nb < 0)`
			`{`
			`cout << utf8proc_errmsg(nb) << endl;`
			`return nb * -1;`
			`}`
			`utf8proc_uint8_t firstCharArray[5];`
			`utf8proc_ssize_t nbOfBytesInFirstChar = utf8proc_encode_char(codepoint, firstCharArray);`
			`if (nbOfBytesInFirstChar == 0)`
			`{`
			`cout << "No valid bytes at start of input." << endl;`
			`return 51;`
			`}`
			`firstCharArray[nbOfBytesInFirstChar] = '\0';`

			`option longopts[] = {`
			`{"codepoint", no_argument, 0, 'p'},`
			`{"utf8", no_argument, 0, 'e'}, // 'e'ight`
			`{"utf16", no_argument, 0, 's'}, // 's'ixteen`
			`{"binary", no_argument, 0, 'b'},`
			`{"octal", no_argument, 0, 'o'},`
			`{"decimal", no_argument, 0, 'd'},`
			`{"xml", no_argument, 0, 'x'},`
			`{"tolower", no_argument, 0, 'L'},`
			`{"toupper", no_argument, 0, 'U'},`
			`{"totitle", no_argument, 0, 'T'},`
			`{0}};`

			`while (1) {`
			`const int opt = getopt_long(argc, argv, "pesbodxLUT", longopts, 0);`

			`if (opt == -1) {`
			`break;`
			`}`

			`switch (opt) {`
			`case 'p':`
			`cout << "Codepoint: " << valueRepresentation(codepoint, 'U') << endl;`
			`break;`
			`case 'e':`
			`cout << "UTF-8: ";`
			`for (uint i = 0; i < nbOfBytesInFirstChar; i++)`
			`{`
			`cout << valueRepresentation(firstCharArray[i], 16) << " ";`
			`}`
			`cout << endl;`
			`break;`
			`case 's':`
			`// https://en.wikipedia.org/wiki/UTF-16`
			`cout << "UTF-16: ";`
			`if (codepoint < 0xFFFF) // 2 bytes only`
			`{`
			`cout << valueRepresentation(codepoint, 17);`
			`}`
			`else // 4 bytes`
			`{`
			`utf8proc_int32_t intermediate = codepoint - 0x10000;`
			`utf8proc_int32_t shifted = intermediate >> 10; // Divide by 0x400 (1024)(2^10)`
			`utf8proc_int32_t highSurrogate = shifted + 0xD800;`
			`utf8proc_int32_t lowTenBits = intermediate % 0x400; // Same result with (intermediate & 1023)`
			`utf8proc_int32_t lowSurrogate = lowTenBits + 0xDC00;`
			`cout << valueRepresentation(highSurrogate, 17) << " ";`
			`cout << valueRepresentation(lowSurrogate, 17);`
			`}`
			`cout << endl;`
			`break;`
			`case 'b':`
			`cout << "Binary: ";`
			`for (uint i = 0; i < nbOfBytesInFirstChar; i++)`
			`{`
			`cout << valueRepresentation(firstCharArray[i], 2) << " ";`
			`}`
			`cout << endl;`
			`break;`
			`case 'o':`
			`cout << "Octal: ";`
			`for (uint i = 0; i < nbOfBytesInFirstChar; i++)`
			`{`
			`cout << valueRepresentation(firstCharArray[i], 8) << " ";`
			`}`
			`cout << endl;`
			`break;`
			`case 'd':`
			`cout << "Decimal: ";`
			`for (uint i = 0; i < nbOfBytesInFirstChar; i++)`
			`{`
			`cout << valueRepresentation(firstCharArray[i], 10) << " ";`
			`}`
			`cout << endl;`
			`break;`
			`case 'x':`
			`cout << "XML decimal: " << valueRepresentation(codepoint, 'x') << endl;`
			`break;`
			`case 'L':`
			`{`
			`utf8proc_int32_t lowerCodepoint = utf8proc_tolower(codepoint);`
			`utf8proc_uint8_t dst[5];`
			`utf8proc_ssize_t bytesWritten = utf8proc_encode_char(lowerCodepoint, &dst[0]);`
			`dst[bytesWritten] = '\0';`
			`cout << "To lower: " << (const char*) dst << endl;`
			`}`
			`break;`
			`case 'U':`
			`{`
			`utf8proc_int32_t upperCodepoint = utf8proc_toupper(codepoint);`
			`utf8proc_uint8_t dst[5];`
			`utf8proc_ssize_t bytesWritten = utf8proc_encode_char( upperCodepoint, &dst[0]);`
			`dst[bytesWritten] = '\0';`
			`cout << "To upper: " << (const char*) dst << endl;`
			`}`
			`break;`
			`case 'T':`
			`{`
			`utf8proc_int32_t upperCodepoint = utf8proc_totitle(codepoint);`
			`utf8proc_uint8_t dst[5];`
			`utf8proc_ssize_t bytesWritten = utf8proc_encode_char( upperCodepoint, &dst[0]);`
			`dst[bytesWritten] = '\0';`
			`cout << "To title: " << (const char*) dst << endl;`
			`}`
			`break;`
			`case 'h':`
			`representationShowHelp();`
			`return 0;`
			`case '?':`
			`return 50; // -5 to -1 are reserved by utf8proc; their absolute values are used here.`
			`default:`
			`break;`
			`}`
			`}`
			`// Show the processed character.`
			`cout << "Character: " << (const char*) firstCharArray << endl;`
			`return 0;`
			`}`

Initial commit. Implement unaccent, normalize. 2023-09-30 17:51:15 +02:00			`int main(int argc, char ** argv)`
			`{`
Add representation mode. 2023-10-02 14:21:26 +02:00			`categoryDescription[UTF8PROC_CATEGORY_CN] = "Other, not assigned";`
			`categoryDescription[UTF8PROC_CATEGORY_LU] = "Letter, uppercase";`
			`categoryDescription[UTF8PROC_CATEGORY_LL] = "Letter, lowercase";`
			`categoryDescription[UTF8PROC_CATEGORY_LT] = "Letter, titlecase";`
			`categoryDescription[UTF8PROC_CATEGORY_LM] = "Letter, modifier";`
			`categoryDescription[UTF8PROC_CATEGORY_LO] = "Letter, other";`
			`categoryDescription[UTF8PROC_CATEGORY_MN] = "Mark, nonspacing";`
			`categoryDescription[UTF8PROC_CATEGORY_MC] = "Mark, spacing combining";`
			`categoryDescription[UTF8PROC_CATEGORY_ME] = "Mark, enclosing";`
			`categoryDescription[UTF8PROC_CATEGORY_NL] = "Number, letter";`
			`categoryDescription[UTF8PROC_CATEGORY_NO] = "Number, other";`
			`categoryDescription[UTF8PROC_CATEGORY_PC] = "Punctuation, connector";`
			`categoryDescription[UTF8PROC_CATEGORY_PD] = "Punctuation, dash";`
			`categoryDescription[UTF8PROC_CATEGORY_PS] = "Punctuation, open";`
			`categoryDescription[UTF8PROC_CATEGORY_PE] = "Punctuation, close";`
			`categoryDescription[UTF8PROC_CATEGORY_PI] = "Punctuation, initial quote";`
			`categoryDescription[UTF8PROC_CATEGORY_PF] = "Punctuation, final quote";`
			`categoryDescription[UTF8PROC_CATEGORY_PO] = "Punctuation, other";`
			`categoryDescription[UTF8PROC_CATEGORY_SM] = "Symbol, math";`
			`categoryDescription[UTF8PROC_CATEGORY_SC] = "Symbol, currency";`
			`categoryDescription[UTF8PROC_CATEGORY_SK] = "Symbol, modifier";`
			`categoryDescription[UTF8PROC_CATEGORY_SO] = "Symbol, other";`
			`categoryDescription[UTF8PROC_CATEGORY_ZS] = "Separator, space";`
			`categoryDescription[UTF8PROC_CATEGORY_ZL] = "Separator, line";`
			`categoryDescription[UTF8PROC_CATEGORY_ZP] = "Separator, paragraph";`
			`categoryDescription[UTF8PROC_CATEGORY_CC] = "Other, control";`
			`categoryDescription[UTF8PROC_CATEGORY_CF] = "Other, format";`
			`categoryDescription[UTF8PROC_CATEGORY_CS] = "Other, surrogate";`
			`categoryDescription[UTF8PROC_CATEGORY_CO] = "Other, private use";`

			`bidirectional[UTF8PROC_BIDI_CLASS_L] = "Left-to-Right";`
			`bidirectional[UTF8PROC_BIDI_CLASS_LRE] = "Left-to-Right Embedding";`
			`bidirectional[UTF8PROC_BIDI_CLASS_LRO] = "Left-to-Right Override";`
			`bidirectional[UTF8PROC_BIDI_CLASS_R] = "Right-to-Left";`
			`bidirectional[UTF8PROC_BIDI_CLASS_AL] = "Right-to-Left Arabic";`
			`bidirectional[UTF8PROC_BIDI_CLASS_RLE] = "Right-to-Left Embedding";`
			`bidirectional[UTF8PROC_BIDI_CLASS_RLO] = "Right-to-Left Override";`
			`bidirectional[UTF8PROC_BIDI_CLASS_PDF] = "Pop Directional Format";`
			`bidirectional[UTF8PROC_BIDI_CLASS_EN] = "European Number";`
			`bidirectional[UTF8PROC_BIDI_CLASS_ES] = "European Separator";`
			`bidirectional[UTF8PROC_BIDI_CLASS_ET] = "European Number Terminator";`
			`bidirectional[UTF8PROC_BIDI_CLASS_AN] = "Arabic Number";`
			`bidirectional[UTF8PROC_BIDI_CLASS_CS] = "Common Number Separator";`
			`bidirectional[UTF8PROC_BIDI_CLASS_NSM] = "Nonspacing Mark";`
			`bidirectional[UTF8PROC_BIDI_CLASS_BN] = "Boundary Neutral";`
			`bidirectional[UTF8PROC_BIDI_CLASS_B] = "Paragraph Separator";`
			`bidirectional[UTF8PROC_BIDI_CLASS_S] = "Segment Separator";`
			`bidirectional[UTF8PROC_BIDI_CLASS_WS] = "Whitespace";`
			`bidirectional[UTF8PROC_BIDI_CLASS_ON] = "Other Neutrals";`
			`bidirectional[UTF8PROC_BIDI_CLASS_LRI] = "Left-to-Right Isolate";`
			`bidirectional[UTF8PROC_BIDI_CLASS_RLI] = "Right-to-Left Isolate";`
			`bidirectional[UTF8PROC_BIDI_CLASS_FSI] = "First Strong Isolate";`
			`bidirectional[UTF8PROC_BIDI_CLASS_PDI] = "Pop Directional Isolate";`

			`// Whatever it means! But does it concern decomposed form only?`
			`decompositionType[0] = "Unknown"; // property->decomp_type is 0 on all tested characters, decomposed or not.`
			`decompositionType[UTF8PROC_DECOMP_TYPE_FONT] = "Font"; // Starts at 1.`
			`decompositionType[UTF8PROC_DECOMP_TYPE_NOBREAK] = "Nobreak";`
			`decompositionType[UTF8PROC_DECOMP_TYPE_INITIAL] = "Initial";`
			`decompositionType[UTF8PROC_DECOMP_TYPE_MEDIAL] = "Medial";`
			`decompositionType[UTF8PROC_DECOMP_TYPE_FINAL] = "Final";`
			`decompositionType[UTF8PROC_DECOMP_TYPE_ISOLATED] = "Isolated";`
			`decompositionType[UTF8PROC_DECOMP_TYPE_CIRCLE] = "Circle";`
			`decompositionType[UTF8PROC_DECOMP_TYPE_SUPER] = "Super";`
			`decompositionType[UTF8PROC_DECOMP_TYPE_SUB] = "Sub";`
			`decompositionType[UTF8PROC_DECOMP_TYPE_VERTICAL] = "Vertical";`
			`decompositionType[UTF8PROC_DECOMP_TYPE_WIDE] = "Wide";`
			`decompositionType[UTF8PROC_DECOMP_TYPE_NARROW] = "Narrow";`
			`decompositionType[UTF8PROC_DECOMP_TYPE_SMALL] = "Small";`
			`decompositionType[UTF8PROC_DECOMP_TYPE_SQUARE] = "Square";`
			`decompositionType[UTF8PROC_DECOMP_TYPE_FRACTION] = "Fraction";`
			`decompositionType[UTF8PROC_DECOMP_TYPE_COMPAT] = "Compat";`

			`// Whatever most values mean!`
			`boundClass[UTF8PROC_BOUNDCLASS_START] = "Start";`
			`boundClass[UTF8PROC_BOUNDCLASS_OTHER] = "Other";`
			`boundClass[UTF8PROC_BOUNDCLASS_CR] = "Cr";`
			`boundClass[UTF8PROC_BOUNDCLASS_LF] = "Lf";`
			`boundClass[UTF8PROC_BOUNDCLASS_CONTROL] = "Control";`
			`boundClass[UTF8PROC_BOUNDCLASS_EXTEND] = "Extend";`
			`boundClass[UTF8PROC_BOUNDCLASS_L] = "L";`
			`boundClass[UTF8PROC_BOUNDCLASS_V] = "V";`
			`boundClass[UTF8PROC_BOUNDCLASS_T] = "T";`
			`boundClass[UTF8PROC_BOUNDCLASS_LV] = "Lv";`
			`boundClass[UTF8PROC_BOUNDCLASS_LVT] = "Lvt";`
			`boundClass[UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR] = "Regional indicator";`
			`boundClass[UTF8PROC_BOUNDCLASS_SPACINGMARK] = "Spacingmark";`
			`boundClass[UTF8PROC_BOUNDCLASS_PREPEND] = "Prepend";`
			`boundClass[UTF8PROC_BOUNDCLASS_ZWJ] = "Zero Width Joiner";`
			`boundClass[UTF8PROC_BOUNDCLASS_E_BASE] = "Emoji Base";`
			`boundClass[UTF8PROC_BOUNDCLASS_E_MODIFIER] = "Emoji Modifier";`
			`boundClass[UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ] = "Glue_After_ZWJ";`
			`boundClass[UTF8PROC_BOUNDCLASS_E_BASE_GAZ] = "E_BASE + GLUE_AFTER_ZJW";`
			`boundClass[UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC] = "Extended_Pictographic",`
			`boundClass[UTF8PROC_BOUNDCLASS_E_ZWG] = "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC + ZWJ";`

			`const char * modeInfo = "A mode of operation is required: unaccent, normalize, representation."`
			`"\nPass '--help' for more information in each mode.";`
Initial commit. Implement unaccent, normalize. 2023-09-30 17:51:15 +02:00			`const int sargc = argc - 1;`
			`if (sargc == 0)`
			`{`
			`cout << modeInfo << endl;`
			`return 20;`
			`}`
			`char * sargv[sargc];`
			`for (uint i = 0; i < argc; i++)`
			`{`
			`if (i < 1)`
			`{`
			`sargv[i] = argv[i];`
			`continue;`
			`}`
			`if (i == 1)`
			`continue;`
			`sargv[i - 1] = argv[i];`
			`}`

			`int ret = 0;`
			`const string mode(argv[1]);`
			`if (mode == "unaccent")`
			`{`
			`ret = unaccent(sargc, sargv);`
			`}`
			`else if (mode == "normalize")`
			`{`
			`ret = normalize(sargc, sargv);`
			`}`
Add representation mode. 2023-10-02 14:21:26 +02:00			`else if (mode == "representation")`
			`{`
			`ret = representation (sargc, sargv);`
			`}`
Initial commit. Implement unaccent, normalize. 2023-09-30 17:51:15 +02:00			`else`
			`{`
			`cout << modeInfo << endl;`
			`return 20;`
			`}`

			`return ret;`
			`}`