Add properties mode.

2023-10-02 14:23:44 +02:00
parent 2a236d10c9
commit 54e19be84a
1 changed files with 115 additions and 1 deletions
--- a/main.cpp
+++ b/main.cpp
@@ -103,6 +103,21 @@ void representationShowHelp()
    cout << message << endl;
 }

+void propertiesShowHelp()
+{
+    string message("This operational mode displays properties of the first identified codepoint."
+    "\n\n  -l, --islower: displays 1 if the codepoint refers to a lower-case character, 0 otherwise"
+    "\n  -u, --isupper: displays 1 if the codepoint refers to an upper-case character, 0 otherwise"
+    "\n  -c, --category: determines the category of a codepoint (Letter, Number, Symbol...)"
+    "\n  -n, --direction: determines the bidirectional class of a codepoint; see utf8proc.h"
+    "\n  -i, --decompositiontype: determines the decomposition type of a codepoint; see utf8proc.h"
+    "\n  -k, --boundclass: determines the boundclass property of a codepoint; see utf8proc.h"
+    "\n  -h, --help: show this message"
+    "\n\nThe input can be piped in or read from stdin. Pass in a single character for simplicity.");
+    
+    cout << message << endl;
+}
+
 int unaccent(int argc, char **argv) {
    string input;
    int options  = STRIP_OPTIONS_DEFAULT;
@@ -401,6 +416,101 @@ int representation(int argc, char ** argv)
    return 0;
 }

+int properties(int argc, char ** argv)
+{
+    for (uint i = 0; i < argc; i++)
+    {
+        string arg = argv[i];
+        if (arg == "-h" || arg == "--help")
+        {
+            propertiesShowHelp();
+            return 0;
+        }
+    }
+    
+    utf8proc_int32_t codepoint = 0;
+    string input;
+    cin >> input;
+    const utf8proc_uint8_t * inputArray = (const utf8proc_uint8_t *) input.c_str();
+    // This stops at the first codepoint; with 'عَ', the first retrieved codepoint is 'ع'.
+    utf8proc_ssize_t nb = utf8proc_iterate(&inputArray[0], -1, &codepoint);
+    if (nb < 0)
+    {
+        cout << utf8proc_errmsg(nb) << endl;
+        return nb * -1;
+    }
+    utf8proc_uint8_t firstCharArray[5];
+    utf8proc_ssize_t nbOfBytesInFirstChar = utf8proc_encode_char(codepoint, firstCharArray);
+    if (nbOfBytesInFirstChar == 0)
+    {
+        cout << "No valid bytes at start of input." << endl;
+        return 51;
+    }
+    firstCharArray[nbOfBytesInFirstChar] = '\0';
+    
+    option longopts[] = {
+        {"islower", no_argument, 0, 'l'},
+        {"isupper", no_argument, 0, 'u'},
+        {"category", no_argument, 0, 'c'},
+        {"direction", no_argument, 0, 'd'},
+        {"decompositiontype", no_argument, 0, 'i'}, // decompos'i'tion
+        {"boundclass", no_argument, 0, 'b'},
+        {0}};
+        
+        while (1) {
+            const int opt = getopt_long(argc, argv, "lucdib", longopts, 0);
+            
+            if (opt == -1) {
+                break;
+            }
+            
+            switch (opt) {
+                case 'l':
+                    cout << "Is lower: " << utf8proc_islower(codepoint) << endl;
+                    break;
+                case 'u':
+                    cout << "Is upper: " << utf8proc_isupper(codepoint) << endl;
+                    break;
+                case 'c':
+                {
+                    utf8proc_category_t category = utf8proc_category(codepoint);
+                    cout << "Category: [" << utf8proc_category_string(codepoint) << "] ";
+                    cout << categoryDescription[category] << endl;
+                }
+                break;
+                case 'd':
+                {
+                    const utf8proc_property_t * property = utf8proc_get_property(codepoint);
+                    cout << "Direction: " << bidirectional[property->bidi_class] << endl;
+                }
+                break;
+                case 'i':
+                {
+                    const utf8proc_property_t * property = utf8proc_get_property(codepoint);
+                    cout << "Decomposition type: " << decompositionType[property->decomp_type] << endl;
+                }
+                break;
+                case 'b':
+                {
+                    // property->boundclass is 1 (other) on all tested characters.
+                    const utf8proc_property_t * property = utf8proc_get_property(codepoint);
+                    cout << "Bound class: " << boundClass[property->boundclass] << endl;
+                }
+                break;
+                case 'h':
+                    representationShowHelp();
+                    return 0;
+                case '?':
+                    return 50; // -5 to -1 are reserved by utf8proc; their absolute values are used here.
+                default:
+                    break;
+            }
+        }
+        // Show the processed character.
+        cout << "Character: " << (const char*) firstCharArray << endl;
+        return 0;
+}
+
 int main(int argc, char ** argv)
 {
    categoryDescription[UTF8PROC_CATEGORY_CN] = "Other, not assigned";
@@ -499,7 +609,7 @@ int main(int argc, char ** argv)
    boundClass[UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC] = "Extended_Pictographic",
    boundClass[UTF8PROC_BOUNDCLASS_E_ZWG] = "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC + ZWJ";
    
-    const char * modeInfo = "A mode of operation is required: unaccent, normalize, representation."
+    const char * modeInfo = "A mode of operation is required: unaccent, normalize, representation, properties."
     "\nPass '--help' for more information in each mode.";
    const int sargc = argc - 1;
    if (sargc == 0)
@@ -534,6 +644,10 @@ int main(int argc, char ** argv)
    {
        ret = representation (sargc, sargv);
    }
+    else if (mode == "properties")
+    {
+        ret = properties(sargc, sargv);
+    }
    else
    {
        cout << modeInfo << endl;