From 7e653610fa473edda70e211a3daf47ef69a2e834 Mon Sep 17 00:00:00 2001 From: Steven Baltakatei Sandoval Date: Thu, 25 Jan 2024 22:59:02 +0000 Subject: [PATCH 1/1] feat(unitproc/bkt-replace_contractions): Use backreferences - Note: Various more contractions added. They won't cover something crazy like Mark Twain's novels that attempt to model accents with copious apostrophe use, but the script should cover most commonly used contractions in almost all scholarly publications. --- unitproc/bkt-replace_contractions | 93 +++++++++++++++++++++++-------- 1 file changed, 71 insertions(+), 22 deletions(-) diff --git a/unitproc/bkt-replace_contractions b/unitproc/bkt-replace_contractions index fe1d5d6..94731e3 100644 --- a/unitproc/bkt-replace_contractions +++ b/unitproc/bkt-replace_contractions @@ -8,7 +8,7 @@ replace_contractions() { # Input: stdin # arg1 file path # Output: stdout - # Version: 0.0.1 + # Version: 0.1.0 # Depends: GNU sed 4.8 # Check input @@ -26,26 +26,75 @@ replace_contractions() { fi; # Perform substitutions + ## Note: See https://en.wiktionary.org/wiki/Category:English_contractions + ## Note: Order of replacements sorted most-specific first. 
sed -E \ - -e "s/(you're|You're|YOU'RE)/youʼre/gI" \ - -e "s/(i'm|I'm|I'M)/Iʼm/gI" \ - -e "s/(you've|You've|YOU'VE)/youʼve/gI" \ - -e "s/(they're|They're|THEY'RE)/theyʼre/gI" \ - -e "s/(we're|We're|WE'RE)/weʼre/gI" \ - -e "s/(they've|They've|THEY'VE)/theyʼve/gI" \ - -e "s/(we've|We've|WE'VE)/weʼve/gI" \ - -e "s/(i've|I've|I'VE)/Iʼve/gI" \ - -e "s/(that's|That's|THAT'S)/thatʼs/gI" \ - -e "s/(what's|What's|WHAT'S)/whatʼs/gI" \ - -e "s/(here's|Here's|HERE'S)/hereʼs/gI" \ - -e "s/(there's|There's|THERE'S)/thereʼs/gI" \ - -e "s/(where's|Where's|WHERE'S)/whereʼs/gI" \ - -e "s/(who's|Who's|WHO'S)/whoʼs/gI" \ - -e "s/(how's|How's|HOW'S)/howʼs/gI" \ - -e "s/(doesn't|Doesn't|DOESN'T)/doesnʼt/gI" \ - -e "s/(don't|Don't|DON'T)/donʼt/gI" \ - -e "s/(i'll|I'll|I'LL)/Iʼll/gI" \ - -e "s/(we'll|We'll|WE'LL)/weʼll/gI" \ - -e "s/(they'll|They'll|THEY'LL)/theyʼll/gI" \ - "$input"; + -e "s/(you|You|YOU)'(ren|REN|ven|VEN)'(t|T)/\1ʼ\2ʼ\3/g" \ + -e "s/(you|You|YOU)'(dn|DN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \ + -e "s/(you|You|YOU)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(you|You|YOU)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \ + -e "s/(y|Y)'(all|ALL)'(dn|DN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4ʼ\5/g" \ + -e "s/(y|Y)'(all|ALL)'(d|D)'(nt|NT)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4ʼ\5/g" \ + -e "s/(y|Y)'(all|ALL)'(d|D)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \ + -e "s/(y|Y)'(all|ALL)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(y|Y)'(all|ALL)/\1ʼ\2/g" \ + -e "s/(y|Y)'(ain|AIN)'(t|T)/\1ʼ\2ʼ\3/g" \ + -e "s/(wouldn|Wouldn|WOULDN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(wouldn|Wouldn|WOULDN)'(t|T)/\1ʼ\2/g" \ + -e "s/(won|Won|WON)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(won|Won|WON)'(t|T)/\1ʼ\2/g" \ + -e "s/(who|Who|WHO)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(who|Who|WHO)'(d|D|ll|LL|re|RE|s|S|ve|VE)/\1ʼ\2/g" \ + -e "s/(where|Where|WHERE)'(s|S)/\1ʼ\2/g" \ + -e "s/(what|What|WHAT)'(ll|LL|re|RE|s|S|ve|VE)/\1ʼ\2/g" \ + -e "s/(weren|Weren|WEREN)'(t|T)/\1ʼ\2/g" \ + -e "s/(we|We|WE)'(ven|VEN)'(t|T)/\1ʼ\2ʼ\3/g" \ + -e "s/(we|We|WE)'(ren|REN)'(t|T)/\1ʼ\2ʼ\3/g" \ + 
-e "s/(we|We|WE)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(we|We|WE)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \ + -e "s/(they|They|THEY)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(they|They|THEY)'(d|D|ll|LL|re|RE|ve|VE)/\1ʼ\2/g" \ + -e "s/(there|There|THERE)'(ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(there|There|THERE)'(s|S|ve|VE)/\1ʼ\2/g" \ + -e "s/(that|That|THAT)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(that|That|THAT)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \ + -e "s/(shouldn|Shouldn|SHOULDN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(shouldn|Shouldn|SHOULDN)'(t|T)/\1ʼ\2/g" \ + -e "s/(she|She|SHE)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(she|She|SHE)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \ + -e "s/(shan|Shan|SHAN)'(t|T)/\1ʼ\2/g" \ + -e "s/(oughtn|Oughtn|OUGHTN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(oughtn|Oughtn|OUGHTN)'(t|T)/\1ʼ\2/g" \ + -e "s/(mustn|Mustn|MUSTN)'(t|T)/\1ʼ\2/g" \ + -e "s/(mustn|Mustn|MUSTN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(mustn|Mustn|MUSTN)'(t|T)/\1ʼ\2/g" \ + -e "s/(mightn|Mightn|MIGHTN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(mightn|Mightn|MIGHTN)'(t|T)/\1ʼ\2/g" \ + -e "s/(might|Might|MIGHT)'(ve|VE)/\1ʼ\2/g" \ + -e "s/(let|Let|LET)'(s|S)/\1ʼ\2/g" \ + -e "s/(it|It|IT)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(it|It|IT)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \ + -e "s/(isn|Isn|ISN)'(t|T)/\1ʼ\2/g" \ + -e "s/(I|i)'(dn|DN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3ʼ\4/g" \ + -e "s/(I|i)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(I|i)'(d|D|ll|LL|m|M|ve|VE)/\1ʼ\2/g" \ + -e "s/(he|He|HE)'(d|D|ll|LL)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(he|He|HE)'(d|D|ll|LL|s|S)/\1ʼ\2/g" \ + -e "s/(haven|Haven|HAVEN)'(t|T)/\1ʼ\2/g" \ + -e "s/(hasn|Hasn|HASN)'(t|T)/\1ʼ\2/g" \ + -e "s/(hadn|Hadn|HADN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(hadn|Hadn|HADN)'(t|T)/\1ʼ\2/g" \ + -e "s/(d|D)'(ya|YA|you|YOU)/\1ʼ\2/g" \ + -e "s/(don|Don|DON)'(t|T)/\1ʼ\2/g" \ + -e "s/(doesn|Doesn|DOESN)'(t|T)/\1ʼ\2/g" \ + -e "s/(didn|Didn|DIDN)'(t|T)/\1ʼ\2/g" \ + -e "s/(could|Could|COULD)'(ve|VE)/\1ʼ\2/g" \ + -e "s/(couldn|Couldn|COULDN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e 
"s/(couldn|Couldn|COULDN)'(t|T)/\1ʼ\2/g" \ + -e "s/(can|Can|CAN)'(t|T)'(ve|VE)/\1ʼ\2ʼ\3/g" \ + -e "s/(can|Can|CAN)'(t|T)/\1ʼ\2/g" \ + -e "s/(aren|Aren|AREN)'(t|T)/\1ʼ\2/g" \ + -e "s/(ate|Ate|ATE)'(nt|NT)/\1ʼ\2/g" \ + -e "s/(ain|Ain|AIN)'(t|T)/\1ʼ\2/g" \ + "$input" }; # replace ' with ʼ in contractions -- 2.30.2