]> zdv2.bktei.com Git - BK-2020-03.git/blob - unitproc/graphu
feat(user/htmlz_to_cbz):Deduplicate consecutive repeat img urls
[BK-2020-03.git] / unitproc / graphu
#!/bin/bash
# Desc: Tallies UTF-8 characters and lists Unicode points
# Usage: graphu [path]
#   Reads standard input when no path is given.
# Output: One tab-separated line per distinct graphical character:
#   tally, code point ("u" + lowercase hex, at least 4 digits), character.
#   Lines are ordered by ascending code point. A character whose bytes
#   cannot be decoded as a single UTF-8 code point is reported as "u????"
#   and listed last.
# Example: graphu file.txt | paste - - -
# Env: GRAPHU_DEBUG_LOG  If set and non-empty, a debug record per character
#   is appended to that file. Nothing is logged by default.
# Ref/attrib: A data cleaner's cookbook. https://www.datafix.com.au/cookbook/characters1.html
# Depends: pv, iconv, awk, xxd
# Version: 0.0.1

# -u: catch typos in variable names; pipefail: a failing pv/awk/iconv stage
# is reflected in the exit status instead of being hidden by the last stage.
set -u -o pipefail

# Print an error message to stderr and abort.
die() {
  printf 'graphu: %s\n' "$*" >&2
  exit 1
}

for tool in pv iconv awk xxd; do
  command -v "$tool" > /dev/null || die "missing dependency: $tool"
done

# Only pass a file operand to pv when a path was actually supplied;
# otherwise pv reads standard input. "--" protects paths starting with "-".
pv_args=(-pbt)
if [[ -n "${1-}" ]]; then
  pv_args+=(-- "$1")
fi

pv "${pv_args[@]}" |
  awk '
    # BEGIN block: runs before any input is read.
    BEGIN {
      # An empty field separator makes every character its own field
      # (supported by gawk and mawk; not specified by POSIX).
      FS = "";
    }

    # Main block: runs for each input line.
    {
      for (idx = 1; idx <= NF; idx++) {
        # Count only graphical (printable, non-space) characters.
        if ($idx ~ /[[:graph:]]/) {
          charFrequency[$idx]++;
        }
      }
    }

    # END block: emit "tally<TAB>character" for every distinct character.
    # Iteration order is unspecified; ordering is done by the final sort.
    END {
      for (char in charFrequency) {
        printf("%d\t%s\n", charFrequency[char], char);
      }
    }
  ' |
  {
    n=0 # record counter, only reported in debug records
    while IFS=$'\t' read -r tally character; do
      # UTF-32BE yields exactly four bytes (eight hex digits) per code point,
      # so characters beyond U+FFFF are handled too. iconv errors are
      # silenced here and detected through the shape of the hex string.
      hex="$(printf '%s' "$character" |
        iconv -f UTF-8 -t UTF-32BE 2> /dev/null |
        xxd -p)" || hex=''

      if [[ "$hex" =~ ^[0-9a-f]{8}$ ]]; then
        code=$((16#$hex))
        printf -v unicode_point 'u%04x' "$code"
      else
        # Not a single valid code point: sort after U+10FFFF.
        code=1114112
        unicode_point='u????'
      fi

      if [[ -n "${GRAPHU_DEBUG_LOG-}" ]]; then
        {
          declare -p n tally character unicode_point
          printf '\n'
        } >> "$GRAPHU_DEBUG_LOG"
      fi

      # The leading decimal code point is a sort key only; it is removed
      # again by the cut stage below.
      printf '%d\t%d\t%s\t%s\n' "$code" "$tally" "$unicode_point" "$character"
      n=$((n + 1))
    done
  } |
  LC_ALL=C sort -t $'\t' -k1,1n |
  cut -f2-
exit