zdv2.bktei.com Git - BK-2020-03.git/blobdiff - unitproc/bkt-replace_apos
chore(unitproc/find_erotica.sh):Add debug lines
[BK-2020-03.git] / unitproc / bkt-replace_apos
index 1ba50fc2214bc8d4270a5a3f8370b73cbaffa237..d19ae27984b0488fe1822e34cfc3aa18d641be54 100644 (file)
@@ -1,14 +1,14 @@
 #!/bin/bash
 
-# Function to replace contractions
-function replace_apostrophes() {
-    # Desc: Replace ' with ʼ in contractions
-    # Note: In contractions of UTF-8 text file, replaces U+0027
+function replace_apos() {
+    # Desc: Replace ' with ʼ in text
+    # Usage: source bkt-replace_apos; replace_apos [FILE]
+    # Note: In UTF-8 text file, replaces U+0027
     #   APOSTROPHE with U+02BC MODIFIER LETTER APOSTROPHE
     # Input: stdin
     #        arg1   file path
     # Output: stdout
-    # Version: 1.1.0 (BK-2020-03)
+    # Version: 1.3.0 (BK-2020-03)
     # Depends: GNU sed 4.8
 
     # Check input
@@ -29,84 +29,86 @@ function replace_apostrophes() {
     ## Note: See https://en.wiktionary.org/wiki/Category:English_contractions
     ## Note: Order of replacements sorted most-specific first.
     sed -E \
-        -e "s/(you|You|YOU)'(ren|REN|ven|VEN)'(t|T)/\1ʼ\2ʼ\3/g" \
-        -e "s/(you|You|YOU)'(dn|DN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \
-        -e "s/(you|You|YOU)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
-        -e "s/(you|You|YOU)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \
-        -e "s/(y|Y)'(all|ALL)'(dn|DN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4ʼ\5/g" \
-        -e "s/(y|Y)'(all|ALL)'(d|D)'(nt|NT)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4ʼ\5/g" \
-        -e "s/(y|Y)'(all|ALL)'(d|D)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \
-        -e "s/(y|Y)'(all|ALL)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2ʼ\3/g" \
-        -e "s/(y|Y)'(all|ALL)/\1ʼ\2/g" \
-        -e "s/(y|Y)'(ain|AIN)'(t|T)/\1ʼ\2ʼ\3/g" \
-        -e "s/(wouldn|Wouldn|WOULDN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
-        -e "s/(wouldn|Wouldn|WOULDN)'(t|T)/\1ʼ\2/g" \
-        -e "s/(won|Won|WON)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
-        -e "s/(won|Won|WON)'(t|T)/\1ʼ\2/g" \
-        -e "s/(who|Who|WHO)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
-        -e "s/(who|Who|WHO)'(d|D|ll|LL|re|RE|s|S|ve|VE)/\1ʼ\2/g" \
-        -e "s/(where|Where|WHERE)'(d|D|s|S)/\1ʼ\2/g" \
-        -e "s/(what|What|WHAT)'(ll|LL|re|RE|s|S|ve|VE)/\1ʼ\2/g" \
-        -e "s/(weren|Weren|WEREN)'(t|T)/\1ʼ\2/g" \
-        -e "s/(we|We|WE)'(ven|VEN)'(t|T)/\1ʼ\2ʼ\3/g" \
-        -e "s/(we|We|WE)'(ren|REN)'(t|T)/\1ʼ\2ʼ\3/g" \
-        -e "s/(we|We|WE)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
-        -e "s/(we|We|WE)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \
-        -e "s/(wasn|Wasn|WASN)'(t|T)/\1ʼ\2/g" \
-        -e "s/(they|They|THEY)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
-        -e "s/(they|They|THEY)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \
-        -e "s/(there|There|THERE)'(ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
-        -e "s/(there|There|THERE)'(s|S|ve|VE)/\1ʼ\2/g" \
-        -e "s/(that|That|THAT)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
-        -e "s/(that|That|THAT)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \
-        -e "s/(shouldn|Shouldn|SHOULDN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
-        -e "s/(shouldn|Shouldn|SHOULDN)'(t|T)/\1ʼ\2/g" \
-        -e "s/(she|She|SHE)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
-        -e "s/(she|She|SHE)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \
-        -e "s/(shan|Shan|SHAN)'(t|T)/\1ʼ\2/g" \
-        -e "s/'(s|S)\b/ʼ\1/g" \
-        -e "s/(s|S)'( |$)/\1ʼ\2/g" \
-        -e "s/(oughtn|Oughtn|OUGHTN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
-        -e "s/(oughtn|Oughtn|OUGHTN)'(t|T)/\1ʼ\2/g" \
-        -e "s/(o|O)'(clock|CLOCK)/\1ʼ\2/g" \
-        -e "s/(mustn|Mustn|MUSTN)'(t|T)/\1ʼ\2/g" \
-        -e "s/(mustn|Mustn|MUSTN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
-        -e "s/(mustn|Mustn|MUSTN)'(t|T)/\1ʼ\2/g" \
-        -e "s/(mightn|Mightn|MIGHTN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
-        -e "s/(mightn|Mightn|MIGHTN)'(t|T)/\1ʼ\2/g" \
-        -e "s/(might|Might|MIGHT)'(ve|VE)/\1ʼ\2/g" \
-        -e "s/(let|Let|LET)'(s|S)/\1ʼ\2/g" \
-        -e "s/(it|It|IT)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
-        -e "s/(it|It|IT)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \
-        -e "s/(isn|Isn|ISN)'(t|T)/\1ʼ\2/g" \
-        -e "s/(I|i)'(dn|DN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \
-        -e "s/(I|i)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
-        -e "s/(I|i)'(d|D|ll|LL|m|M|ve|VE)/\1ʼ\2/g" \
-        -e "s/(how|How|HOW)'(d|D)/\1ʼ\2/g" \
-        -e "s/(he|He|HE)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \
-        -e "s/(he|He|HE)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \
-        -e "s/(haven|Haven|HAVEN)'(t|T)/\1ʼ\2/g" \
-        -e "s/(hasn|Hasn|HASN)'(t|T)/\1ʼ\2/g" \
-        -e "s/(hadn|Hadn|HADN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
-        -e "s/(hadn|Hadn|HADN)'(t|T)/\1ʼ\2/g" \
-        -e "s/'(em\b)/ʼ\1/g" \
-        -e "s/(d|D)'(ya|YA|you|YOU)/\1ʼ\2/g" \
-        -e "s/(don|Don|DON)'(t|T)/\1ʼ\2/g" \
-        -e "s/(doesn|Doesn|DOESN)'(t|T)/\1ʼ\2/g" \
-        -e "s/(didn|Didn|DIDN)'(t|T)/\1ʼ\2/g" \
-        -e "s/(could|Could|COULD)'(ve|VE)/\1ʼ\2/g" \
-        -e "s/(couldn|Couldn|COULDN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
-        -e "s/(couldn|Couldn|COULDN)'(t|T)/\1ʼ\2/g" \
-        -e "s/(c|C)'(mere|MERE)/\1ʼ\2/g" \
-        -e "s/(can|Can|CAN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \
-        -e "s/(can|Can|CAN)'(t|T)/\1ʼ\2/g" \
-        -e "s/'(cause|Cause|CAUSE)/ʼ\1/g" \
-        -e "s/'(bout|Bout|BOUT)/ʼ\1/g" \
-        -e "s/(aren|Aren|AREN)'(t|T)/\1ʼ\2/g" \
-        -e "s/(ate|Ate|ATE)'(nt|NT)/\1ʼ\2/g" \
-        -e "s/(ain|Ain|AIN)'(t|T)/\1ʼ\2/g" \
+        -e "s/(you|You|YOU)['’](ren|REN|ven|VEN)['’](t|T)/\1ʼ\2ʼ\3/g" \
+        -e "s/(you|You|YOU)['’](dn|DN)['’](t|T)['’](ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \
+        -e "s/(you|You|YOU)['’](d|D|ll|LL)['’](ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(you|You|YOU)['’](d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \
+        -e "s/(y|Y)['’](all|ALL)['’](dn|DN)['’](t|T)['’](ve|VE)/\1ʼ\2ʼ\3ʼ\4ʼ\5/g" \
+        -e "s/(y|Y)['’](all|ALL)['’](d|D)['’](nt|NT)['’](ve|VE)/\1ʼ\2ʼ\3ʼ\4ʼ\5/g" \
+        -e "s/(y|Y)['’](all|ALL)['’](d|D)['’](ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \
+        -e "s/(y|Y)['’](all|ALL)['’](d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(y|Y)['’](all|ALL)/\1ʼ\2/g" \
+        -e "s/(y|Y)['’](ain|AIN)['’](t|T)/\1ʼ\2ʼ\3/g" \
+        -e "s/(wouldn|Wouldn|WOULDN)['’](t|T)['’](ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(wouldn|Wouldn|WOULDN)['’](t|T)/\1ʼ\2/g" \
+        -e "s/(won|Won|WON)['’](t|T)['’](ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(won|Won|WON)['’](t|T)/\1ʼ\2/g" \
+        -e "s/(who|Who|WHO)['’](d|D|ll|LL)['’](ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(who|Who|WHO)['’](d|D|ll|LL|re|RE|s|S|ve|VE)/\1ʼ\2/g" \
+        -e "s/(where|Where|WHERE)['’](d|D|s|S)/\1ʼ\2/g" \
+        -e "s/(what|What|WHAT)['’](ll|LL|re|RE|s|S|ve|VE)/\1ʼ\2/g" \
+        -e "s/(weren|Weren|WEREN)['’](t|T)/\1ʼ\2/g" \
+        -e "s/(we|We|WE)['’](ven|VEN)['’](t|T)/\1ʼ\2ʼ\3/g" \
+        -e "s/(we|We|WE)['’](ren|REN)['’](t|T)/\1ʼ\2ʼ\3/g" \
+        -e "s/(we|We|WE)['’](d|D|ll|LL)['’](ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(we|We|WE)['’](d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \
+        -e "s/(wasn|Wasn|WASN)['’](t|T)/\1ʼ\2/g" \
+        -e "s/(they|They|THEY)['’](d|D|ll|LL)['’](ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(they|They|THEY)['’](d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \
+        -e "s/(there|There|THERE)['’](ll|LL)['’](ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(there|There|THERE)['’](s|S|ve|VE)/\1ʼ\2/g" \
+        -e "s/(that|That|THAT)['’](d|D|ll|LL)['’](ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(that|That|THAT)['’](d|D|ll|LL|s|S)/\1ʼ\2/g" \
+        -e "s/(shouldn|Shouldn|SHOULDN)['’](t|T)['’](ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(shouldn|Shouldn|SHOULDN)['’](t|T)/\1ʼ\2/g" \
+        -e "s/(she|She|SHE)['’](d|D|ll|LL)['’](ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(she|She|SHE)['’](d|D|ll|LL|s|S)/\1ʼ\2/g" \
+        -e "s/(shan|Shan|SHAN)['’](t|T)/\1ʼ\2/g" \
+        -e "s/['’](s|S)\b/ʼ\1/g" \
+        -e "s/(s|S)['’]( |,|.|$)/\1ʼ\2/g" \
+        -e "s/(oughtn|Oughtn|OUGHTN)['’](t|T)['’](ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(oughtn|Oughtn|OUGHTN)['’](t|T)/\1ʼ\2/g" \
+        -e "s/(o|O)['’](clock|CLOCK)/\1ʼ\2/g" \
+        -e "s/(mustn|Mustn|MUSTN)['’](t|T)/\1ʼ\2/g" \
+        -e "s/(mustn|Mustn|MUSTN)['’](t|T)['’](ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(mustn|Mustn|MUSTN)['’](t|T)/\1ʼ\2/g" \
+        -e "s/(mightn|Mightn|MIGHTN)['’](t|T)['’](ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(mightn|Mightn|MIGHTN)['’](t|T)/\1ʼ\2/g" \
+        -e "s/(might|Might|MIGHT)['’](ve|VE)/\1ʼ\2/g" \
+        -e "s/(let|Let|LET)['’](s|S)/\1ʼ\2/g" \
+        -e "s/(it|It|IT)['’](d|D|ll|LL)['’](ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(it|It|IT)['’](d|D|ll|LL|s|S)/\1ʼ\2/g" \
+        -e "s/(isn|Isn|ISN)['’](t|T)/\1ʼ\2/g" \
+        -e "s/(I|i)['’](dn|DN)['’](t|T)['’](ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \
+        -e "s/(I|i)['’](d|D|ll|LL)['’](ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(I|i)['’](d|D|ll|LL|m|M|ve|VE)/\1ʼ\2/g" \
+        -e "s/(how|How|HOW)['’](d|D)/\1ʼ\2/g" \
+        -e "s/(he|He|HE)['’](d|D|ll|LL)['’](ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(he|He|HE)['’](d|D|ll|LL|s|S)/\1ʼ\2/g" \
+        -e "s/(haven|Haven|HAVEN)['’](t|T)/\1ʼ\2/g" \
+        -e "s/(hasn|Hasn|HASN)['’](t|T)/\1ʼ\2/g" \
+        -e "s/(hadn|Hadn|HADN)['’](t|T)['’](ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(hadn|Hadn|HADN)['’](t|T)/\1ʼ\2/g" \
+        -e "s/['’](em\b)/ʼ\1/g" \
+        -e "s/(d|D)['’](ya|YA|you|YOU)/\1ʼ\2/g" \
+        -e "s/([[:alnum:]])['’](d|D)/\1ʼ\2/g" \
+        -e "s/(don|Don|DON)['’](t|T)/\1ʼ\2/g" \
+        -e "s/(doesn|Doesn|DOESN)['’](t|T)/\1ʼ\2/g" \
+        -e "s/(didn|Didn|DIDN)['’](t|T)/\1ʼ\2/g" \
+        -e "s/(could|Could|COULD)['’](ve|VE)/\1ʼ\2/g" \
+        -e "s/(couldn|Couldn|COULDN)['’](t|T)['’](ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(couldn|Couldn|COULDN)['’](t|T)/\1ʼ\2/g" \
+        -e "s/(c|C)['’](mere|MERE)/\1ʼ\2/g" \
+        -e "s/(can|Can|CAN)['’](t|T)['’](ve|VE)/\1ʼ\2ʼ\3/g" \
+        -e "s/(can|Can|CAN)['’](t|T)/\1ʼ\2/g" \
+        -e "s/['’](cause|Cause|CAUSE)/ʼ\1/g" \
+        -e "s/['’](bout|Bout|BOUT)/ʼ\1/g" \
+        -e "s/(aren|Aren|AREN)['’](t|T)/\1ʼ\2/g" \
+        -e "s/(ate|Ate|ATE)['’](nt|NT)/\1ʼ\2/g" \
+        -e "s/(ain|Ain|AIN)['’](t|T)/\1ʼ\2/g" \
+        -e "s/([[:alpha:]])['’]([[:alpha:]])/\1ʼ\2/g" \
         "$input"
-}; # replace ' with ʼ in contractions
+}; # replace ' with ʼ
 
 # Author: Steven Baltakatei Sandoval
 # License: GPLv3+