feat(user/mw_wc2sp.sh):Add script to convert wikicode to mw subpages
[BK-2020-03.git] / user / c / src / count_char_nv.c
1 #include <unistd.h>
2 #include <stdio.h>
3
4 /*
5 Desc: A program to print the frequency of different UTF-8 characters
6 taking into account the presence of diacritical marks.
7 Depends: glibc >=2.35 (needed for the printf "%b" binary conversion)
8 Info: Diacritical marks are found in Unicode blocks:
9 - U+0300-036F: Combining Diacritical Marks
10 - U+1AB0-1AFF: Combining Diacritical Marks Extended
11 - U+1DC0-1DFF: Combining Diacritical Marks Supplement
12 - U+20D0-20FF: Combining Diacritical Marks for Symbols
13 - U+FE20-FE2F: Combining Half Marks
14 Ref/Attrib: UTF-8 byte mechanics: https://www.johndcook.com/blog/2019/09/09/how-utf-8-works/
15 */
16
/*
 * Dump every byte read from stdin as one CSV row (decimal, hexadecimal,
 * binary) and finish with the total number of bytes read.
 *
 * Note that the final "Character count" is a count of bytes returned by
 * getchar(), not of UTF-8 characters or graphemes.
 *
 * Returns 0 once getchar() reports EOF.
 */
int main(void) {
    /* NOTE(review): the reason for this 10 ms startup delay is not evident
       from this file; it is kept to preserve behavior -- confirm whether
       it is still needed. */
    usleep(10000);

    long nc = 0; /* number of bytes read so far */
    int c;       /* getchar() result: a byte value, or EOF */

    printf("%5s,%5s,%9s\n", "dec", "hex", "bin");
    while ((c = getchar()) != EOF) {
        /* %x and %b take an unsigned int argument; c is non-negative inside
           the loop, so the casts are value-preserving and keep the argument
           types exact. */
        printf("%5d,%5x,%9b\n", c, (unsigned)c, (unsigned)c);
        ++nc;
    }

    printf("Character count:%ld\n", nc);
    return 0;
}
32
33 /* Strategy
34
35 - Define table of valid Navajo graphemes
36
37 - Define map of precomposed characters and combining mark permutations
38 to graphemes in the table.
39
40 - Read input sequentially, incrementing a list of integer counts of
41 graphemes detected.
42
43 - Print grapheme totals.
44
45 */
46
47 /* Process
48
49 1. Read char int into c via c = getchar()
50 2. Detect if int c is printable ASCII (c within [32-126]) or part of a multibyte sequence (most significant bit is 1)
51 a. If ASCII, increment nc, continue to next loop.
52 b. If multibyte, then calculate Unicode code point.
53 3. Detect if code point falls into known combining mark ranges.
54 a. If comark, continue to next loop.
55 b. If not comark increment nc, continue to next loop.
56
57 */
58
59 // Author: Steven Baltakatei Sandoval
60 // License: GPLv3+