Commit | Line | Data |
---|---|---|
d8d899f8 SBS |
1 | #include <unistd.h> |
2 | #include <stdio.h> | |
3 | ||
4 | /* | |
5 | Desc: A program to print the frequency of different UTF-8 characters | |
6 | taking into account the presence of diacritical marks. | |
7 | Depends: glibc >=2.35 (first release whose printf supports the %b binary conversion) | |
8 | Info: Diacritical marks are found in Unicode blocks: | |
9 | - U+0300-036F: Combining Diacritical Marks | |
10 | - U+1AB0-1AFF: Combining Diacritical Marks Extended | |
11 | - U+1DC0-1DFF: Combining Diacritical Marks Supplement | |
12 | - U+20D0-20FF: Combining Diacritical Marks for Symbols | |
13 | - U+FE20-FE2F: Combining Half Marks | |
14 | Ref/Attrib: UTF-8 byte mechanics: https://www.johndcook.com/blog/2019/09/09/how-utf-8-works/ | |
15 | */ | |
16 | ||
/*
 * Dump every byte read from standard input as one CSV row holding its
 * decimal, hexadecimal and binary value, then print how many bytes were
 * read.
 *
 * Returns 0 on success, 1 if reading standard input failed.
 *
 * Note: the "%b" conversion is a C23 addition to printf; a C library
 * that lacks it will not render the binary column.
 */
int main(void) {
  /* NOTE(review): the reason for this 10 ms pause is not evident from the
     code; it is kept unchanged. */
  usleep(10000);

  long nc = 0; /* number of bytes read so far (not grapheme-aware) */
  int c;

  printf("%5s,%5s,%9s\n","dec","hex","bin");
  while ((c = getchar()) != EOF) {
    /* getchar() yields an unsigned char value converted to int, so it is
       never negative here; the casts give %x and %b the unsigned argument
       they expect without changing the printed value. */
    printf("%5d,%5x,%9b\n", c, (unsigned)c, (unsigned)c);
    ++nc;
  }

  printf("Character count:%ld\n",nc);

  /* EOF is returned for a read error as well as end of input; report the
     error instead of exiting successfully. */
  if (ferror(stdin)) {
    perror("stdin");
    return 1;
  }
  return 0;
}
32 | ||
33 | /* Strategy | |
34 | ||
35 | - Define table of valid Navajo graphemes | |
36 | ||
37 | - Define map of precomposed characters and combining mark permutations | |
38 | to graphemes in the table. | |
39 | ||
40 | - Read input sequentially, incrementing a list of integer counts of | |
41 | graphemes detected. | |
42 | ||
43 | - Print grapheme totals. | |
44 | ||
45 | */ | |
46 | ||
47 | /* Process | |
48 | ||
49 | 1. Read char int into c via c = getchar() | |
50 | 2. Detect if int c is ASCII (high bit 0, i.e. c within [0-127]; printable subset is [32-126]) or part of a multibyte sequence (high bit 1) | |
51 | a. If ASCII, increment nc, continue to next loop. | |
52 | b. If multibyte, then calculate Unicode code point. | |
53 | 3. Detect if code point falls into known combining mark ranges. | |
54 | a. If comark, continue to next loop. | |
55 | b. If not comark increment nc, continue to next loop. | |
56 | ||
57 | */ | |
58 | ||
59 | // Author: Steven Baltakatei Sandoval | |
60 | // License: GPLv3+ |