feat(user/transcribe_whisper.sh):Specify dir_in via arg
[BK-2020-03.git] / unitproc / bkt-replace_apos
CommitLineData
265d3d9a
SBS
#!/bin/bash

function replace_apos() {
    # Desc: Replace ' with ʼ in text
    # Note: In UTF-8 text, replaces U+0027 APOSTROPHE with
    #       U+02BC MODIFIER LETTER APOSTROPHE where the apostrophe is
    #       part of a word (contraction, possessive, elision).
    #       An apostrophe that matches none of the patterns below
    #       (e.g. one used as a quotation mark next to punctuation)
    #       is left unchanged.
    # Input: stdin  (used when arg1 is absent or is not a regular file)
    #        arg1   file path (optional)
    # Output: stdout
    # Return: 1 if more than one argument is given;
    #         otherwise the exit status of sed
    # Version: 1.2.0 (BK-2020-03)
    # Depends: GNU sed 4.8

    # Keep the working variable out of the caller's namespace.
    local input;

    # Check input
    if [[ "$#" -gt 1 ]]; then
        echo "FATAL:Incorrect argument count:$#" 1>&2;
        return 1;
    fi;

    if [[ -f "$1" ]]; then
        # Use specified file
        input="$1";
    else
        # Use standard input ("-" is read by sed as stdin)
        input="-";
    fi;

    # Perform substitutions
    ## Note: See https://en.wiktionary.org/wiki/Category:English_contractions
    ## Note: Order of replacements sorted most-specific first, so that a
    ##       multi-apostrophe contraction (e.g. wouldn't've) is matched
    ##       as a whole before its shorter prefix (wouldn't).
    ## Note: Expressions are double-quoted because they contain a literal
    ##       apostrophe; "\1", "\." and "$)" contain no shell expansion
    ##       and reach sed unchanged.
    ## Note: "--" ends option parsing so that a file path beginning with
    ##       "-" is not mistaken for a sed option.
    sed -E \
        -e "s/(you|You|YOU)'(ren|REN|ven|VEN)'(t|T)/\1ʼ\2ʼ\3/g" \
        -e "s/(you|You|YOU)'(dn|DN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \
        -e "s/(you|You|YOU)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
        -e "s/(you|You|YOU)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \
        -e "s/(y|Y)'(all|ALL)'(dn|DN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4ʼ\5/g" \
        -e "s/(y|Y)'(all|ALL)'(d|D)'(nt|NT)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4ʼ\5/g" \
        -e "s/(y|Y)'(all|ALL)'(d|D)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \
        -e "s/(y|Y)'(all|ALL)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2ʼ\3/g" \
        -e "s/(y|Y)'(all|ALL)/\1ʼ\2/g" \
        -e "s/(y|Y)'(ain|AIN)'(t|T)/\1ʼ\2ʼ\3/g" \
        -e "s/(wouldn|Wouldn|WOULDN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
        -e "s/(wouldn|Wouldn|WOULDN)'(t|T)/\1ʼ\2/g" \
        -e "s/(won|Won|WON)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
        -e "s/(won|Won|WON)'(t|T)/\1ʼ\2/g" \
        -e "s/(who|Who|WHO)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
        -e "s/(who|Who|WHO)'(d|D|ll|LL|re|RE|s|S|ve|VE)/\1ʼ\2/g" \
        -e "s/(where|Where|WHERE)'(d|D|s|S)/\1ʼ\2/g" \
        -e "s/(what|What|WHAT)'(ll|LL|re|RE|s|S|ve|VE)/\1ʼ\2/g" \
        -e "s/(weren|Weren|WEREN)'(t|T)/\1ʼ\2/g" \
        -e "s/(we|We|WE)'(ven|VEN)'(t|T)/\1ʼ\2ʼ\3/g" \
        -e "s/(we|We|WE)'(ren|REN)'(t|T)/\1ʼ\2ʼ\3/g" \
        -e "s/(we|We|WE)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
        -e "s/(we|We|WE)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \
        -e "s/(wasn|Wasn|WASN)'(t|T)/\1ʼ\2/g" \
        -e "s/(they|They|THEY)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
        -e "s/(they|They|THEY)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \
        -e "s/(there|There|THERE)'(ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
        -e "s/(there|There|THERE)'(s|S|ve|VE)/\1ʼ\2/g" \
        -e "s/(that|That|THAT)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
        -e "s/(that|That|THAT)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \
        -e "s/(shouldn|Shouldn|SHOULDN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
        -e "s/(shouldn|Shouldn|SHOULDN)'(t|T)/\1ʼ\2/g" \
        -e "s/(she|She|SHE)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
        -e "s/(she|She|SHE)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \
        -e "s/(shan|Shan|SHAN)'(t|T)/\1ʼ\2/g" \
        -e "s/'(s|S)\b/ʼ\1/g" \
        -e "s/(s|S)'( |,|\.|$)/\1ʼ\2/g" \
        -e "s/(oughtn|Oughtn|OUGHTN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
        -e "s/(oughtn|Oughtn|OUGHTN)'(t|T)/\1ʼ\2/g" \
        -e "s/(o|O)'(clock|CLOCK)/\1ʼ\2/g" \
        -e "s/(mustn|Mustn|MUSTN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
        -e "s/(mustn|Mustn|MUSTN)'(t|T)/\1ʼ\2/g" \
        -e "s/(mightn|Mightn|MIGHTN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
        -e "s/(mightn|Mightn|MIGHTN)'(t|T)/\1ʼ\2/g" \
        -e "s/(might|Might|MIGHT)'(ve|VE)/\1ʼ\2/g" \
        -e "s/(let|Let|LET)'(s|S)/\1ʼ\2/g" \
        -e "s/(it|It|IT)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
        -e "s/(it|It|IT)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \
        -e "s/(isn|Isn|ISN)'(t|T)/\1ʼ\2/g" \
        -e "s/(I|i)'(dn|DN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \
        -e "s/(I|i)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
        -e "s/(I|i)'(d|D|ll|LL|m|M|ve|VE)/\1ʼ\2/g" \
        -e "s/(how|How|HOW)'(d|D)/\1ʼ\2/g" \
        -e "s/(he|He|HE)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
        -e "s/(he|He|HE)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \
        -e "s/(haven|Haven|HAVEN)'(t|T)/\1ʼ\2/g" \
        -e "s/(hasn|Hasn|HASN)'(t|T)/\1ʼ\2/g" \
        -e "s/(hadn|Hadn|HADN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
        -e "s/(hadn|Hadn|HADN)'(t|T)/\1ʼ\2/g" \
        -e "s/'(em\b)/ʼ\1/g" \
        -e "s/(d|D)'(ya|YA|you|YOU)/\1ʼ\2/g" \
        -e "s/([[:alnum:]])'(d|D)/\1ʼ\2/g" \
        -e "s/(don|Don|DON)'(t|T)/\1ʼ\2/g" \
        -e "s/(doesn|Doesn|DOESN)'(t|T)/\1ʼ\2/g" \
        -e "s/(didn|Didn|DIDN)'(t|T)/\1ʼ\2/g" \
        -e "s/(could|Could|COULD)'(ve|VE)/\1ʼ\2/g" \
        -e "s/(couldn|Couldn|COULDN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
        -e "s/(couldn|Couldn|COULDN)'(t|T)/\1ʼ\2/g" \
        -e "s/(c|C)'(mere|MERE)/\1ʼ\2/g" \
        -e "s/(can|Can|CAN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
        -e "s/(can|Can|CAN)'(t|T)/\1ʼ\2/g" \
        -e "s/'(cause|Cause|CAUSE)/ʼ\1/g" \
        -e "s/'(bout|Bout|BOUT)/ʼ\1/g" \
        -e "s/(aren|Aren|AREN)'(t|T)/\1ʼ\2/g" \
        -e "s/(ate|Ate|ATE)'(nt|NT)/\1ʼ\2/g" \
        -e "s/(ain|Ain|AIN)'(t|T)/\1ʼ\2/g" \
        -e "s/([[:alpha:]])'([[:alpha:]])/\1ʼ\2/g" \
        -- "$input";
    # The last expression is a catch-all: any apostrophe directly between
    # two letters is treated as part of a word.
}; # replace ' with ʼ
4aa4ed1b
SBS

# Author: Steven Baltakatei Sandoval
# License: GPLv3+