#!/bin/bash
# Desc: Tallies UTF-8 characters and lists Unicode points
# Usage: graphu [path]
# Example: graphu file.txt | paste - - -
# Ref/attrib: A data cleaner's cookbook. https://www.datafix.com.au/cookbook/characters1.html
# Depends: pv, iconv, awk, xxd
# Version: 0.0.1

n=0;

pv -pbt "$1" |\
awk '
    # BEGIN block: This runs before any input is read.
    BEGIN {
        # Set the field separator to an empty string, so every character becomes a separate field.
        FS = "";
    }

    # Main processing block: Executed for each input line.
    {
        # Loop through each character in the current line.
        for (idx = 1; idx <= NF; idx++) {
            # Only consider graphical (printable) characters, ignoring spaces/control characters.
            if ($idx ~ /[[:graph:]]/) {
                # Increment the count for this character in our associative array.
                charFrequency[$idx]++;
            }
        }
    }

    # END block: This runs after all input has been processed.
    END {
        # Loop through the associative array and print the count and the character.
        for (char in charFrequency) {
            # Print the frequency count followed by a tab and the character.
            printf("%d\t%s\n", charFrequency[char], char);
        }
    }
' |\
sort -t $'\t' -k2 |\
while read -r line; do
    # Split each tab-separated "count<TAB>character" record from awk.
    tally="$(cut -f1 <<< "$line")";
    character="$(cut -f2 <<< "$line")";
    # Convert the character to UTF-16BE and read its code point from the hex dump (e.g. u0061 for "a").
    unicode_point="$(cut -f2 <<< "$line" | iconv -f utf-8 -t UNICODEBIG | xxd -g 2 | awk '{printf("u%s",$2)}')";
    declare -p line unicode_point n tally character >> /tmp/log.txt; # debug
    printf "\n" >> /tmp/log.txt;
    # Emit one record: count, Unicode point, character.
    printf "%d\t%s\t%s\n" "$tally" "$unicode_point" "$character";
    ((n++));
    unset tally character unicode_point;
done |\
sort -k2;

exit;
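
# Illustrative output (a sketch, not from a real run): for a hypothetical file.txt
# containing the text "aab", the tab-separated count/point/character records
# would look like:
#
#   2   u0061   a
#   1   u0062   b
#
# Piping through `paste - - -` (see Example above) folds three such records
# onto each output line for a more compact listing.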