#!/bin/bash
# Desc: Tallies UTF-8 characters and lists Unicode points
# Usage: graphu [path]
# Example: graphu file.txt | paste - - -
# Ref/attrib: A data cleaner's cookbook. https://www.datafix.com.au/cookbook/characters1.html
# Depends: pv, iconv, awk, xxd
# Version: 0.0.1

n=0;

pv -pbt "$1" |\
awk '
    # BEGIN block: This runs before any input is read.
    BEGIN {
        # Set the field separator to an empty string, so every character becomes a separate field.
        FS = "";
    }

    # Main processing block: Executed for each input line.
    {
        # Loop through each character in the current line.
        for (idx = 1; idx <= NF; idx++) {
            # Only consider graphical (printable) characters, ignoring spaces/control characters.
            if ($idx ~ /[[:graph:]]/) {
                # Increment the count for this character in our associative array.
                charFrequency[$idx]++;
            }
        }
    }

    # END block: This runs after all input has been processed.
    END {
        # Loop through the associative array and print the count and the character.
        for (char in charFrequency) {
            # Print the frequency count followed by a tab and the character.
            printf("%d\t%s\n", charFrequency[char], char);
        }
    }
' |\
sort -t $'\t' -k2 |\
while read -r line; do
    # Split each tab-separated "count<TAB>character" record from awk.
    tally="$(cut -f1 <<< "$line")";
    character="$(cut -f2 <<< "$line")";
    # Convert the character to UTF-16BE and read its code point from the hex dump (e.g. u0061 for "a").
    unicode_point="$(cut -f2 <<< "$line" | iconv -f utf-8 -t UNICODEBIG | xxd -g 2 | awk '{printf("u%s",$2)}')";
    declare -p line unicode_point n tally character >> /tmp/log.txt; # debug
    printf "\n" >> /tmp/log.txt;
    # Emit one record: count, Unicode point, character.
    printf "%d\t%s\t%s\n" "$tally" "$unicode_point" "$character";
    ((n++));
    unset tally character unicode_point;
done |\
sort -k2;

exit;
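
# Illustrative output (a sketch, not from a real run): for a hypothetical file.txt
# containing the text "aab", the tab-separated count/point/character records
# would look like:
#
#   2   u0061   a
#   1   u0062   b
#
# Piping through `paste - - -` (see Example above) folds three such records
# onto each output line for a more compact listing.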