--- /dev/null
+#!/bin/bash
+# Desc: Tallies UTF-8 characters and lists Unicode points
+# Usage: graphu [path]
+# Example: graphu file.txt | paste - - -
+# Ref/attrib: A data cleaner's cookbook. https://www.datafix.com.au/cookbook/characters1.html
+# Depends: pv, iconv, awk, xxd
+# Version: 0.0.1
+
+n=0;
+pv -pbt "$1" |\
+ awk '
+ # BEGIN block: This runs before any input is read.
+ BEGIN {
+ # Set the field separator to an empty string, so every character becomes a separate field.
+ FS = "";
+ }
+
+ # Main processing block: Execute for each input line.
+ {
+ # Loop through each character in the current line.
+ for (idx = 1; idx <= NF; idx++) {
+ # Only consider graphical (printable) characters, ignoring spaces/control characters.
+ if ($idx ~ /[[:graph:]]/) {
+ # Increment the count for this character in our associative array.
+ charFrequency[$idx]++;
+ }
+ }
+ }
+
+ # END block: This runs after all input has been processed.
+ END {
+ # Loop through the associative array and print the count and the character.
+ for (char in charFrequency) {
+ # Print the frequency count followed by a tab and the character.
+ printf("%d\t%s\n", charFrequency[char], char);
+ }
+ }
+ ' |\
+ sort -t $'\t' -k2 |\
+ while read -r line; do
+ tally="$(cut -f1 <<< "$line"; )";
+ character="$(cut -f2 <<< "$line"; )";
+ unicode_point="$(cut -f2 <<< "$line" | iconv -f utf-8 -t UNICODEBIG | xxd -g 2 | awk '{printf("u%s",$2)}')";
+ declare -p line unicode_point n tally character >> /tmp/log.txt; # debug
+ printf "\n" >> /tmp/log.txt;
+ printf "%d\t%s\t%s\n" "$tally" "$unicode_point" "$character";
+ ((n++));
+ unset tally character unicode_point;
+ done | sort -k2;
+exit;