feat(unitproc/graphu): Tally UTF-8 characters

author Steven Baltakatei Sandoval <baltakatei@gmail.com>

Sun, 13 Apr 2025 10:20:30 +0000 (10:20 +0000)

committer Steven Baltakatei Sandoval <baltakatei@gmail.com>

Sun, 13 Apr 2025 10:20:30 +0000 (10:20 +0000)
author Steven Baltakatei Sandoval <baltakatei@gmail.com>
Sun, 13 Apr 2025 10:20:30 +0000 (10:20 +0000)
committer Steven Baltakatei Sandoval <baltakatei@gmail.com>
Sun, 13 Apr 2025 10:20:30 +0000 (10:20 +0000)
diff --git a/unitproc/graphu b/unitproc/graphu

new file mode 100644 (file)

index 0000000..a940c2c
--- /dev/null
+++ b/unitproc/graphu
@@ -0,0 +1,50 @@
+#!/bin/bash
+# Desc: Tallies UTF-8 characters and lists Unicode points
+# Usage: graphu [path]
+# Example: graphu file.txt | paste - - -
+# Ref/attrib: A data cleaner's cookbook.  https://www.datafix.com.au/cookbook/characters1.html
+# Depends: pv, iconv, awk, xxd
+# Version: 0.0.1
+
+n=0;
+pv -pbt "$1" |\
+    awk '
+      # BEGIN block: This runs before any input is read.
+      BEGIN {
+        # Set the field separator to an empty string, so every character becomes a separate field.
+        FS = "";
+      }
+
+      # Main processing block: Execute for each input line.
+      {
+        # Loop through each character in the current line.
+        for (idx = 1; idx <= NF; idx++) {
+          # Only consider graphical (printable) characters, ignoring spaces/control characters.
+          if ($idx ~ /[[:graph:]]/) {
+            # Increment the count for this character in our associative array.
+            charFrequency[$idx]++;
+          }
+        }
+      }
+
+      # END block: This runs after all input has been processed.
+      END {
+        # Loop through the associative array and print the count and the character.
+        for (char in charFrequency) {
+          # Print the frequency count followed by a tab and the character.
+          printf("%d\t%s\n", charFrequency[char], char);
+        }
+      }
+    ' |\
+    sort -t $'\t' -k2 |\
+    while read -r line; do
+        tally="$(cut -f1 <<< "$line"; )";
+        character="$(cut -f2 <<< "$line"; )";
+        unicode_point="$(cut -f2 <<< "$line" | iconv -f utf-8 -t UNICODEBIG | xxd -g 2 | awk '{printf("u%s",$2)}')";
+        declare -p line unicode_point n tally character >> /tmp/log.txt;  # debug
+        printf "\n" >> /tmp/log.txt;
+        printf "%d\t%s\t%s\n" "$tally" "$unicode_point" "$character";
+        ((n++));
+        unset tally character unicode_point;
+    done | sort -k2;
+exit;
author	Steven Baltakatei Sandoval <baltakatei@gmail.com>
	Sun, 13 Apr 2025 10:20:30 +0000 (10:20 +0000)
committer	Steven Baltakatei Sandoval <baltakatei@gmail.com>
	Sun, 13 Apr 2025 10:20:30 +0000 (10:20 +0000)