#include <unistd.h>
#include <stdio.h>

/*
Desc: A program to print the frequency of different UTF-8 characters
      taking into account the presence of diacritical marks.
Depends: glibc >=2.35 (for the printf "%b" binary conversion)
Info: Diacritical marks are found in Unicode blocks:
  - U+0300-036F: Combining Diacritical Marks
  - U+1AB0-1AFF: Combining Diacritical Marks Extended
  - U+1DC0-1DFF: Combining Diacritical Marks Supplement
  - U+20D0-20FF: Combining Diacritical Marks for Symbols
  - U+FE20-FE2F: Combining Half Marks
Ref/Attrib: UTF-8 byte mechanics: https://www.johndcook.com/blog/2019/09/09/how-utf-8-works/
*/

/*
 * Dump every byte read from stdin as one CSV row (decimal, hexadecimal,
 * binary), then print the total number of bytes read.
 *
 * The "%b" binary conversion is a C23 feature (glibc >= 2.35).
 *
 * Returns 0 on success, 1 if reading stdin failed.
 */
int main(void) {
  /* 10 ms pause before reading.
     NOTE(review): the reason for this delay is not evident from this
     file -- confirm whether it is still needed. */
  usleep(10000);

  long nc = 0; /* number of bytes read so far */
  int c;       /* int, not char, so EOF stays distinguishable from data */

  printf("%5s,%5s,%9s\n", "dec", "hex", "bin");
  while ((c = getchar()) != EOF) {
    /* getchar() yields an unsigned char value converted to int, so the
       casts are lossless; %x and %b require an unsigned int argument. */
    printf("%5d,%5x,%9b\n", c, (unsigned int)c, (unsigned int)c);
    ++nc;
  }

  /* nc is incremented once per getchar() call, i.e. once per byte, so a
     multibyte UTF-8 character contributes more than one to this total. */
  printf("Character count:%ld\n", nc);

  /* getchar() returns EOF on a read error as well as at end of input;
     report the former instead of exiting with success. */
  if (ferror(stdin)) {
    fputs("error: failed to read from stdin\n", stderr);
    return 1;
  }
  return 0;
}

/* Strategy

- Define table of valid Navajo graphemes

- Define map of precomposed characters and combining mark permutations
  to graphemes in the table.

- Read input sequentially, incrementing a list of integer counts of
  graphemes detected.

- Print grapheme totals.

*/

/* Process

1. Read char int into c via c = getchar()
2. Detect if int c is ASCII (first bit 0, c within [0-127]; printable range is [32-126]) or part of a multibyte sequence (first bit 1)
  a. If ASCII, increment nc, continue to next loop.
  b. If multibyte, then calculate Unicode code point.
3. Detect if code point falls into known combining mark ranges.
  a. If comark, continue to next loop.
  b. If not comark increment nc, continue to next loop.

*/

// Author: Steven Baltakatei Sandoval
// License: GPLv3+
