2 # Desc: Tallies the graphical UTF-8 characters in the input and lists each one with its Unicode code point
   4 # Example: graphu file.txt | paste - - - 
   5 # Ref/attrib: A data cleaner's cookbook.  https://www.datafix.com.au/cookbook/characters1.html 
   6 # Depends: pv, iconv, awk, xxd, cut
  12       # BEGIN block: This runs before any input is read. 
  14         # Set the field separator to an empty string, so every character becomes a separate field. 
  18       # Main processing block: Execute for each input line. 
  20         # Loop through each character in the current line. 
  21         for (idx = 1; idx <= NF; idx++) { 
  22           # Only count visible characters ([[:graph:]]: printable and non-space); whitespace and control characters are skipped.
  23           if ($idx ~ /[[:graph:]]/) { 
  24             # Increment the count for this character in our associative array. 
  25             charFrequency[$idx]++; 
  30       # END block: This runs after all input has been processed. 
  32         # Loop through the associative array and print the count and the character. 
  33         for (char in charFrequency) { 
  34           # Print the frequency count followed by a tab and the character. 
  35           printf("%d\t%s\n", charFrequency[char], char); 
  40     while read -r line
; do 
  41         tally
="$(cut -f1 <<< "$line"; )"; 
  42         character
="$(cut -f2 <<< "$line"; )"; 
  43         unicode_point
="$(cut -f2 <<< "$line" | iconv -f utf-8 -t UNICODEBIG | xxd -g 2 | awk '{printf("u
%s
",$2)}')"; 
  44         declare -p line unicode_point n tally character 
>> /tmp
/log.txt
;  # debug 
  45         printf "\n" >> /tmp
/log.txt
; 
  46         printf "%d\t%s\t%s\n" "$tally" "$unicode_point" "$character"; 
  48         unset tally character unicode_point
;