From: Steven Baltakatei Sandoval Date: Thu, 25 Jan 2024 22:59:02 +0000 (+0000) Subject: feat(unitproc/bkt-replace_contractions):Use backreferences X-Git-Url: https://zdv2.bktei.com/gitweb/BK-2020-03.git/commitdiff_plain/7e653610fa473edda70e211a3daf47ef69a2e834 feat(unitproc/bkt-replace_contractions):Use backreferences - Note: Various more contractions added. They won't cover something crazy like Mark Twain's novels that attempt to model accents with copious apostrophe use, but the script should cover most commonly used contractions in almost scholarly publications. --- diff --git a/unitproc/bkt-replace_contractions b/unitproc/bkt-replace_contractions index fe1d5d6..94731e3 100644 --- a/unitproc/bkt-replace_contractions +++ b/unitproc/bkt-replace_contractions @@ -8,7 +8,7 @@ replace_contractions() { # Input: stdin # arg1 file path # Output: stdout - # Version: 0.0.1 + # Version: 0.1.0 # Depends: GNU sed 4.8 # Check input @@ -26,26 +26,75 @@ replace_contractions() { fi; # Perform substitutions + ## Note: See https://en.wiktionary.org/wiki/Category:English_contractions + ## Note: Order of replacements sorted most-specific first. 
sed -E \ - -e "s/(you're|You're|YOU'RE)/youʼre/gI" \ - -e "s/(i'm|I'm|I'M)/Iʼm/gI" \ - -e "s/(you've|You've|YOU'VE)/youʼve/gI" \ - -e "s/(they're|They're|THEY'RE)/theyʼre/gI" \ - -e "s/(we're|We're|WE'RE)/weʼre/gI" \ - -e "s/(they've|They've|THEY'VE)/theyʼve/gI" \ - -e "s/(we've|We've|WE'VE)/weʼve/gI" \ - -e "s/(i've|I've|I'VE)/Iʼve/gI" \ - -e "s/(that's|That's|THAT'S)/thatʼs/gI" \ - -e "s/(what's|What's|WHAT'S)/whatʼs/gI" \ - -e "s/(here's|Here's|HERE'S)/hereʼs/gI" \ - -e "s/(there's|There's|THERE'S)/thereʼs/gI" \ - -e "s/(where's|Where's|WHERE'S)/whereʼs/gI" \ - -e "s/(who's|Who's|WHO'S)/whoʼs/gI" \ - -e "s/(how's|How's|HOW'S)/howʼs/gI" \ - -e "s/(doesn't|Doesn't|DOESN'T)/doesnʼt/gI" \ - -e "s/(don't|Don't|DON'T)/donʼt/gI" \ - -e "s/(i'll|I'll|I'LL)/Iʼll/gI" \ - -e "s/(we'll|We'll|WE'LL)/weʼll/gI" \ - -e "s/(they'll|They'll|THEY'LL)/theyʼll/gI" \ - "$input"; + -e "s/(you|You|YOU)'(ren|REN|ven|VEN)'(t|T)/\1ʼ\2ʼ\3/g" \ + -e "s/(you|You|YOU)'(dn|DN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \ + -e "s/(you|You|YOU)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(you|You|YOU)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \ + -e "s/(y|Y)'(all|ALL)'(dn|DN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4ʼ\5/g" \ + -e "s/(y|Y)'(all|ALL)'(d|D)'(nt|NT)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4ʼ\5/g" \ + -e "s/(y|Y)'(all|ALL)'(d|D)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \ + -e "s/(y|Y)'(all|ALL)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(y|Y)'(all|ALL)/\1ʼ\2/g" \ + -e "s/(y|Y)'(ain|AIN)'(t|T)/\1ʼ\2ʼ\3/g" \ + -e "s/(wouldn|Wouldn|WOULDN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(wouldn|Wouldn|WOULDN)'(t|T)/\1ʼ\2/g" \ + -e "s/(won|Won|WON)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(won|Won|WON)'(t|T)/\1ʼ\2/g" \ + -e "s/(who|Who|WHO)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(who|Who|WHO)'(d|D|ll|LL|re|RE|s|S|ve|VE)/\1ʼ\2/g" \ + -e "s/(where|Where|WHERE)'(s|S)/\1ʼ\2/g" \ + -e "s/(what|What|WHAT)'(ll|LL|re|RE|s|S|ve|VE)/\1ʼ\2/g" \ + -e "s/(weren|Weren|WEREN)'(t|T)/\1ʼ\2/g" \ + -e "s/(we|We|WE)'(ven|VEN)'(t|T)/\1ʼ\2ʼ\3/g" \ + -e "s/(we|We|WE)'(ren|REN)'(t|T)/\1ʼ\2ʼ\3/g" \ + 
-e "s/(we|We|WE)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(we|We|WE)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \ + -e "s/(they|They|THEY)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(they|They|THEY)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \ + -e "s/(there|There|THERE)'(ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(there|There|THERE)'(s|S|ve|VE)/\1ʼ\2/g" \ + -e "s/(that|That|THAT)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(that|That|THAT)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \ + -e "s/(shouldn|Shouldn|SHOULDN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(shouldn|Shouldn|SHOULDN)'(t|T)/\1ʼ\2/g" \ + -e "s/(she|She|SHE)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(she|She|SHE)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \ + -e "s/(shan|Shan|SHAN)'(t|T)/\1ʼ\2/g" \ + -e "s/(oughtn|Oughtn|OUGHTN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(oughtn|Oughtn|OUGHTN)'(t|T)/\1ʼ\2/g" \ + -e "s/(mustn|Mustn|MUSTN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(mustn|Mustn|MUSTN)'(t|T)/\1ʼ\2/g" \ + -e "s/(mustn|Mustn|MUSTN)'(t|T)/\1ʼ\2/g" \ + -e "s/(mightn|Mightn|MIGHTN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(mightn|Mightn|MIGHTN)'(t|T)/\1ʼ\2/g" \ + -e "s/(might|Might|MIGHT)'(ve|VE)/\1ʼ\2/g" \ + -e "s/(let|Let|LET)'(s|S)/\1ʼ\2/g" \ + -e "s/(it|It|IT)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(it|It|IT)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \ + -e "s/(isn|Isn|ISN)'(t|T)/\1ʼ\2/g" \ + -e "s/(I|i)'(dn|DN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \ + -e "s/(I|i)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(I|i)'(d|D|ll|LL|m|M|ve|VE)/\1ʼ\2/g" \ + -e "s/(he|He|HE)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(he|He|HE)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \ + -e "s/(haven|Haven|HAVEN)'(t|T)/\1ʼ\2/g" \ + -e "s/(hasn|Hasn|HASN)'(t|T)/\1ʼ\2/g" \ + -e "s/(hadn|Hadn|HADN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(hadn|Hadn|HADN)'(t|T)/\1ʼ\2/g" \ + -e "s/(d|D)'(ya|YA|you|YOU)/\1ʼ\2/g" \ + -e "s/(don|Don|DON)'(t|T)/\1ʼ\2/g" \ + -e "s/(doesn|Doesn|DOESN)'(t|T)/\1ʼ\2/g" \ + -e "s/(didn|Didn|DIDN)'(t|T)/\1ʼ\2/g" \ + -e "s/(could|Could|COULD)'(ve|VE)/\1ʼ\2/g" \ + -e "s/(couldn|Couldn|COULDN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e 
"s/(couldn|Couldn|COULDN)'(t|T)/\1ʼ\2/g" \ + -e "s/(can|Can|CAN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(can|Can|CAN)'(t|T)/\1ʼ\2/g" \ + -e "s/(aren|Aren|AREN)'(t|T)/\1ʼ\2/g" \ + -e "s/(ate|Ate|ATE)'(nt|NT)/\1ʼ\2/g" \ + -e "s/(ain|Ain|AIN)'(t|T)/\1ʼ\2/g" \ + "$input" }; # replace ' with ʼ in contractions