user/c/src/count_char_nv.c

   1 #include <unistd.h>
   2 #include <stdio.h>
   3
   4 /*
   5 Desc: A program to print the frequency of different UTF-8 characters
   6       taking into account the presence of diacritical marks.
   7 Depends: glibc >=2.35 (printf %b binary conversion)
   8 Info: Diacritical marks are found in Unicode blocks:
   9   - U+0300-036F: Combining Diacritical Marks
  10   - U+1AB0-1AFF: Combining Diacritical Marks Extended
  11   - U+1DC0-1DFF: Combining Diacritical Marks Supplement
  12   - U+20D0-20FF: Combining Diacritical Marks for Symbols
  13   - U+FE20-FE2F: Combining Half Marks
  14 Ref/Attrib: UTF-8 byte mechanics: https://www.johndcook.com/blog/2019/09/09/how-utf-8-works/
  15 */
  16
  17 int main() {
  18   usleep(10000);
  19   int c;
  20   long nc;
  21
  22   nc = 0;
  23   printf("%5s,%5s,%9s\n","dec","hex","bin");
  24   while ( (c = getchar()) != EOF) {
  25     printf("%5d,%5x,%9b\n",c,c,c);
  26     ++nc;
  27   };
  28
  29   printf("Character count:%ld\n",nc);
  30   return 0;
  31 };
  32
  33 /* Strategy
  34
  35 - Define table of valid Navajo graphemes
  36
  37 - Define map of precomposed characters and combining mark permutations
  38   to graphemes in the table.
  39
  40 - Read input sequentially, incrementing a list of integer counts of
  41   graphemes detected.
  42
  43 - Print grapheme totals.
  44
  45 */
  46
  47 /* Process
  48
  49 1. Read char int into c via c = getchar()
  50 2. Detect if int c is ASCII (high bit 0; printable range is [32-126]) or multibyte (high bit 1)
  51   a. If ASCII, increment nc, continue to next loop.
  52   b. If multibyte, then calculate Unicode code point.
  53 3. Detect if code point falls into known combining mark ranges.
  54   a. If comark, continue to next loop.
  55   b. If not comark increment nc, continue to next loop.
  56
  57 */
  58
  59 // Author: Steven Baltakatei Sandoval
  60 // License: GPLv3+