#!/bin/bash
# Desc: Tallies UTF-8 characters and lists Unicode points
# Usage: graphu path...        (use "-" as path to read standard input)
# Example: graphu file.txt
# Output: one line per distinct graphical character, sorted by code point
#         label:  count<TAB>uXXXX<TAB>character
#         (XXXX = lowercase hex code point, at least four digits)
# Ref/attrib: A data cleaner's cookbook. https://www.datafix.com.au/cookbook/characters1.html
# Depends: awk, od, sort; pv (optional, progress meter only)
# Version: 0.0.2
#
# Provenance: unitproc/graphu as introduced by commit
#   15af0db8848eaa129fba08d3a37326d37b4de5c3
#   "feat(unitproc/graphu):Tally UTF-8 characters"
#   From: Steven Baltakatei Sandoval
#   Date: Sun, 13 Apr 2025 10:20:30 +0000 (+0000)
#   https://zdv2.bktei.com/gitweb/BK-2020-03.git/commitdiff_plain/15af0db8848eaa129fba08d3a37326d37b4de5c3?ds=inline

#######################################
# Print a short usage message.
# Outputs:   usage text on stderr
# Returns:   2, so a bare invocation ends with a non-zero status
#######################################
usage() {
  printf 'Usage: %s path...\n  Use "-" as path to read standard input.\n' \
    "${0##*/}" >&2
  return 2
}

#######################################
# Count every graphical character read from stdin.
# Outputs:   "count<TAB>character" lines on stdout, in awk's (arbitrary)
#            array traversal order
# NOTE(review): relies on the awk in use treating an empty FS as
# "one field per character" and on it being multibyte-aware in the
# current locale (gawk in a UTF-8 locale is); POSIX leaves an empty FS
# unspecified.
#######################################
tally_graph_chars() {
  awk '
    BEGIN {
      # Empty field separator: every character becomes its own field.
      FS = ""
    }

    {
      for (idx = 1; idx <= NF; idx++) {
        # Keep graphical (printable, non-space) characters only.
        if ($idx ~ /[[:graph:]]/) {
          char_frequency[$idx]++
        }
      }
    }

    END {
      for (ch in char_frequency) {
        printf("%d\t%s\n", char_frequency[ch], ch)
      }
    }
  '
}

#######################################
# Print the code point label ("u" + lowercase hex, min. 4 digits) of a
# single UTF-8 encoded character.
# Arguments: $1 - exactly one UTF-8 encoded character
# Outputs:   label on stdout without trailing newline, e.g. u0041, u1f600
# Returns:   0 on success; 1 if $1 is empty or is not exactly one
#            well-formed UTF-8 sequence (nothing is printed then)
#
# The bytes are decoded arithmetically instead of going through a 16-bit
# conversion whose first 16-bit group is read back: that route cannot
# represent code points above U+FFFF.
#######################################
codepoint_label() {
  local -a bytes=()
  local -i lead cp expected i

  # od prints the raw bytes as unsigned decimals, independent of locale.
  read -r -a bytes < <(printf '%s' "$1" | od -An -v -tu1)
  (( ${#bytes[@]} > 0 )) || return 1

  # The lead byte determines the sequence length and its payload bits.
  lead=${bytes[0]}
  if (( lead < 0x80 )); then
    cp=$(( lead ))
    expected=1
  elif (( (lead & 0xE0) == 0xC0 )); then
    cp=$(( lead & 0x1F ))
    expected=2
  elif (( (lead & 0xF0) == 0xE0 )); then
    cp=$(( lead & 0x0F ))
    expected=3
  elif (( (lead & 0xF8) == 0xF0 )); then
    cp=$(( lead & 0x07 ))
    expected=4
  else
    return 1
  fi
  (( ${#bytes[@]} == expected )) || return 1

  # Each continuation byte (10xxxxxx) contributes six more bits.
  for (( i = 1; i < expected; i++ )); do
    (( (bytes[i] & 0xC0) == 0x80 )) || return 1
    cp=$(( (cp << 6) | (bytes[i] & 0x3F) ))
  done

  printf 'u%04x' "$cp"
}

#######################################
# Insert the code point label between count and character.
# Inputs:    "count<TAB>character" lines on stdin
# Outputs:   "count<TAB>label<TAB>character" lines on stdout; the label
#            is left empty (and a warning goes to stderr) when the
#            character cannot be decoded
#######################################
label_tallies() {
  local tally character label
  while IFS=$'\t' read -r tally character; do
    [[ -n "$character" ]] || continue
    if ! label=$(codepoint_label "$character"); then
      printf 'graphu: cannot decode character: %q\n' "$character" >&2
      label=''
    fi
    printf '%d\t%s\t%s\n' "$tally" "$label" "$character"
  done
}

#######################################
# Tally the graphical characters of the given inputs.
# Arguments: one or more paths; "-" means standard input
# Outputs:   "count<TAB>label<TAB>character" lines on stdout, sorted on
#            the label column
# Returns:   exit status of the final sort
#######################################
graphu_main() {
  {
    # pv only adds a progress display on stderr; cat is an equivalent
    # data source when pv is not installed.
    if command -v pv > /dev/null 2>&1; then
      pv -pbt -- "$@"
    else
      cat -- "$@"
    fi
  } \
    | tally_graph_chars \
    | label_tallies \
    | sort -k2
}

# No explicit exit: the script ends with the status of the branch taken.
if (( $# == 0 )); then
  usage
else
  graphu_main "$@"
fi