user/c/src/count_char_nv.c

#include <unistd.h>
#include <stdio.h>

/*
Desc: A program to print the frequency of different UTF-8 characters
      taking into account the presence of diacritical marks.
Depends: glibc >= 2.35 (needed for the printf "%b" binary conversion)
Info: Diacritical marks are found in Unicode blocks:
  - U+0300-036F: Combining Diacritical Marks
  - U+1AB0-1AFF: Combining Diacritical Marks Extended
  - U+1DC0-1DFF: Combining Diacritical Marks Supplement
  - U+20D0-20FF: Combining Diacritical Marks for Symbols
  - U+FE20-FE2F: Combining Half Marks
Ref/Attrib: UTF-8 byte mechanics: https://www.johndcook.com/blog/2019/09/09/how-utf-8-works/
*/

/*
 * Return nonzero when code point cp lies in one of the combining-mark
 * blocks that are not counted as characters of their own:
 *   U+0300-036F, U+1AB0-1AFF, U+1DC0-1DFF, U+20D0-20FF, U+FE20-FE2F.
 */
static int is_combining_mark(unsigned long cp) {
  return (cp >= 0x0300UL && cp <= 0x036FUL)
      || (cp >= 0x1AB0UL && cp <= 0x1AFFUL)
      || (cp >= 0x1DC0UL && cp <= 0x1DFFUL)
      || (cp >= 0x20D0UL && cp <= 0x20FFUL)
      || (cp >= 0xFE20UL && cp <= 0xFE2FUL);
}

/*
 * Read stdin to end of input, print every byte as decimal, hex and binary,
 * then print the number of characters read.
 *
 * A "character" is one UTF-8 encoded code point that is not a combining
 * mark (see is_combining_mark), so a base letter followed by combining
 * diacritics counts once.  Every ASCII byte, including control characters
 * such as newline, counts as one character, so the total for pure-ASCII
 * input equals the byte count.  Malformed input (a stray continuation byte,
 * an invalid lead byte, or a sequence cut short) counts as one character
 * per malformed fragment.  Overlong encodings are not rejected.
 *
 * Returns 0 on success, 1 if reading stdin failed.
 */
int main(void) {
  /* NOTE(review): the purpose of this 10 ms start-up delay is not evident
     from this file; kept as-is.  usleep() is obsolete in POSIX.1-2008, so
     consider nanosleep() or removing the delay once its intent is known. */
  usleep(10000);

  long nc = 0;          /* characters counted so far */
  unsigned long cp = 0; /* code point being assembled from a multibyte sequence */
  int pending = 0;      /* continuation bytes still expected for cp */
  int c;

  printf("%5s,%5s,%9s\n","dec","hex","bin");
  while ((c = getchar()) != EOF) {
    /* %x and %b expect unsigned int; c is a byte value 0..255 here. */
    printf("%5d,%5x,%9b\n", c, (unsigned)c, (unsigned)c);

    if (pending > 0) {
      if ((c & 0xC0) == 0x80) {
        /* Continuation byte 10xxxxxx: append its six payload bits. */
        cp = (cp << 6) | (unsigned long)(c & 0x3F);
        if (--pending == 0 && !is_combining_mark(cp)) {
          ++nc;
        }
        continue;
      }
      /* Sequence cut short: count the fragment once, then fall through
         and interpret c as the start of something new. */
      ++nc;
      pending = 0;
    }

    if (c < 0x80) {
      ++nc;                         /* single-byte (ASCII) character */
    } else if ((c & 0xE0) == 0xC0) {
      cp = (unsigned long)(c & 0x1F); /* 110xxxxx: two-byte sequence */
      pending = 1;
    } else if ((c & 0xF0) == 0xE0) {
      cp = (unsigned long)(c & 0x0F); /* 1110xxxx: three-byte sequence */
      pending = 2;
    } else if ((c & 0xF8) == 0xF0) {
      cp = (unsigned long)(c & 0x07); /* 11110xxx: four-byte sequence */
      pending = 3;
    } else {
      ++nc;                         /* stray continuation or invalid lead byte */
    }
  }

  /* Input ended in the middle of a multibyte sequence. */
  if (pending > 0) {
    ++nc;
  }

  /* getchar() returns EOF for both end of input and read errors. */
  if (ferror(stdin)) {
    fprintf(stderr, "error reading standard input\n");
    return 1;
  }

  printf("Character count:%ld\n",nc);
  return 0;
}

/* Strategy

- Define table of valid Navajo graphemes

- Define map of precomposed characters and combining mark permutations
  to graphemes in the table.

- Read input sequentially, incrementing a list of integer counts of
  graphemes detected.

- Print grapheme totals.

*/

/* Process

1. Read char int into c via c = getchar()
2. Detect if int c is ASCII (c within [32-126]) or multibyte (first bit 1)
  a. If ASCII, increment nc, continue to next loop.
  b. If multibyte, then calculate Unicode code point.
3. Detect if code point falls into known combining mark ranges.
  a. If it is a combining mark, do not count it; continue to next loop.
  b. If it is not a combining mark, increment nc; continue to next loop.

*/

// Author: Steven Baltakatei Sandoval
// License: GPLv3+
Commit	Line	Data
	1	#include <unistd.h>
	2	#include <stdio.h>
	3
	4	/*
	5	Desc: A program to print the frequency of different UTF-8 characters
	6	taking into account the presence of diacritical marks.
	7	Depends: glibc >2.35
	8	Info: Diacritical marks are found in Unicode blocks:
	9	- U+0300-036F: Combining Diacritical Marks
	10	- U+1AB0-1AFF: Combining Diacritical Marks Extended
	11	- U+1DC0-1DFF: Combining Diacritical Marks Supplement
	12	- U+20D0-20FF: Combining Diacritical Marks for Symbols
	13	- U+FE20-FE2F: Combining Half Marks
	14	Ref/Attrib: UTF-8 byte mechanics: https://www.johndcook.com/blog/2019/09/09/how-utf-8-works/
	15	*/
	16
	17	int main() {
	18	usleep(10000);
	19	int c;
	20	long nc;
	21
	22	nc = 0;
	23	printf("%5s,%5s,%9s\n","dec","hex","bin");
	24	while ( (c = getchar()) != EOF) {
	25	printf("%5d,%5x,%9b\n",c,c,c);
	26	++nc;
	27	};
	28
	29	printf("Character count:%ld\n",nc);
	30	return 0;
	31	};
	32
	33	/* Strategy
	34
	35	- Define table of valid Navajo graphemes
	36
	37	- Define map of precomposed characters and combining mark permutations
	38	to graphemes in the table.
	39
	40	- Read input sequentially, incrementing a list of integer counts of
	41	graphemes detected.
	42
	43	- Print grapheme totals.
	44
	45	*/
	46
	47	/* Process
	48
	49	1. Read char int into c via c = getchar()
	50	2. Detect if int c is ASCII (c within [32-126]) or multibyte (first bit 1)
	51	a. If ASCII, increment nc, continue to next loop.
	52	b. If multibyte, then calculate Unicode code point.
	53	3. Detect if code point falls into known combining mark ranges.
	54	a. If comark, continue to next loop.
	55	b. If not comark increment nc, continue to next loop.
	56
	57	*/
	58
	59	// Author: Steven Baltakatei Sandoval
	60	// License: GPLv3+