unitproc/graphu

   1 #!/bin/bash
   2 # Desc: Tallies UTF-8 characters and lists Unicode points
   3 # Usage: graphu [path]
   4 # Example: graphu file.txt | paste - - -
   5 # Ref/attrib: A data cleaner's cookbook.  https://www.datafix.com.au/cookbook/characters1.html
   6 # Depends: pv, iconv, awk, xxd
   7 # Version: 0.0.1
   8
   9 n=0;
  10 pv -pbt "$1" |\
  11     awk '
  12       # BEGIN block: This runs before any input is read.
  13       BEGIN {
  14         # Set the field separator to an empty string, so every character becomes a separate field.
  15         FS = "";
  16       }
  17
  18       # Main processing block: Execute for each input line.
  19       {
  20         # Loop through each character in the current line.
  21         for (idx = 1; idx <= NF; idx++) {
  22           # Only consider graphical (printable) characters, ignoring spaces/control characters.
  23           if ($idx ~ /[[:graph:]]/) {
  24             # Increment the count for this character in our associative array.
  25             charFrequency[$idx]++;
  26           }
  27         }
  28       }
  29
  30       # END block: This runs after all input has been processed.
  31       END {
  32         # Loop through the associative array and print the count and the character.
  33         for (char in charFrequency) {
  34           # Print the frequency count followed by a tab and the character.
  35           printf("%d\t%s\n", charFrequency[char], char);
  36         }
  37       }
  38     ' |\
  39     sort -t $'\t' -k2 |\
  40     while read -r line; do
  41         tally="$(cut -f1 <<< "$line"; )";
  42         character="$(cut -f2 <<< "$line"; )";
  43         unicode_point="$(cut -f2 <<< "$line" | iconv -f utf-8 -t UNICODEBIG | xxd -g 2 | awk '{printf("u%s",$2)}')";
  44         declare -p line unicode_point n tally character >> /tmp/log.txt;  # debug
  45         printf "\n" >> /tmp/log.txt;
  46         printf "%d\t%s\t%s\n" "$tally" "$unicode_point" "$character";
  47         ((n++));
  48         unset tally character unicode_point;
  49     done | sort -k2;
  50 exit;