1 # Date: 2020-02-21T21:23Z;
 
   3 # Author: Steven Baltakatei Sandoval
 
   5 # License: This bash script, 'bkcsvjoin.sh', is licensed under GPLv3 or
 
   6 # later by Steven Baltakatei Sandoval:
 
   8 #    'bkcsvjoin.sh' sorts and joins two CSV files based on first field
 
   9 #    Copyright (C) 2020  Steven Baltakatei Sandoval (baltakatei.com)
 
  11 #    This program is free software: you can redistribute it and/or modify
 
  12 #    it under the terms of the GNU General Public License as published by
 
  13 #    the Free Software Foundation, either version 3 of the License, or
 
  16 #    This program is distributed in the hope that it will be useful,
 
  17 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
 
  18 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
  19 #    GNU General Public License for more details.
 
  21 #    A copy of the GNU General Public License may be found at
 
  22 #    <https://www.gnu.org/licenses/>.
 
  24 # Description: Sorts and joins two CSV files based upon the contents
 
  25 # of the first field ("field1") in each line of CSV file. It assumes
 
  26 # the first line is a header labelling field names. It outputs several
 
  27 # files into a new directory in the working directory. Set theory
 
  28 # characters in output file names indicate file 1 elements with "A"
 
  29 # and file 2 elements with "B". Input assumes UNIX-style line
 
  30 # endings. The output files use the following numbers and definitions:
 
  32 #    1. A⋃B     The union of A and B. Contains all field1 elements from file 1 and 2.
 
  34 #    2. A⋃(A⋂B) Contains all field1 elements of file 1 plus the matches between
 
  37 #    3. B⋃(B⋂A) Contains all field1 elements of file 2 plus the matches between
 
  40 #    4. A⋂B     Contains only field1 elements present in both file 1 and file 2.
 
  42 #    5. A-B     Contains all field1 elements of file 1 except when a matching
 
  43 #               element is present in file 2.
 
  45 #    6. B-A     Contains all field1 elements of file 2 except when a matching
 
  46 #               element is present in file 1.
 
  48 #    7. A⊖B     Contains all field1 elements of file 1 and file 2 except for
 
  49 #               elements which are present in both file 1 and 2.
 
  51 # Usage: bkcsvjoin.sh  [ file1 ]  [ file2 ] > fileOut.csv
 
  53 # Dependencies: GNU Coreutils 8.30-3 (bash, join, sort, head, tail,
 
  54 # mkdir) (see end of file for license details). Note: only tested on
 
  55 # Debian GNU/Linux 10.
 
  57 echoerr() { echo "$@" 1>&2; } # Function for outputing text to stderr.
 
  59 SCRIPT_DATE=$(date +%Y%m%dT%H%M%S%z)
 
  62 # Create temporary working directory
 
  63 TEMP_DIRPATH="$SCRIPT_PATH"/"$SCRIPT_DATE"..temp
 
  66 # Read file names as input arguments
 
  69 FILE1_BASENAME=$(basename "$FILE1_PATH")
 
  70 FILE2_BASENAME=$(basename "$FILE2_PATH")
 
  71 #[[ -f "$FILE1_PATH" ]] && echoerr "DEBUG:$FILE1_PATH is file with basename $FILE1_BASENAME."
 
  72 #[[ -f "$FILE2_PATH" ]] && echoerr "DEBUG:$FILE2_PATH is file with basename $FILE2_BASENAME."
 
  74 # Warn and prompt if carriage return character is used.
 
  75 if [[ $(grep -Uc $'\x0D' "$FILE1_PATH") -gt 0 ]] || [[ $(grep -Uc $'\x0D' "$FILE2_PATH") -gt 0 ]]; then
 
  76     echoerr "ERROR: Carriage Return character(s) detected. Windows text file?"
 
  77     read -p "Do you wish to attempt to convert DOS-style line endings to UNIX-style by removing all carriage returns? (probably okay) [y/n]" choice
 
  80               OPTION_DOS2UNIX="true"
 
  83               OPTION_DOS2UNIX="false"
 
  84               echoerr "WARNING:Output may contain mix of DOS-style (CRLF) and UNIX-style (LF) line endings!"
 
  85               echoerr "  See https://stackoverflow.com/a/2613834 "
 
  88         * ) echo "Invalid selection."
 
  89             echoerr "Exiting." && exit 1;; 
 
  93 # Sort files on field1 and write to temporary directory. See [1], and [2].
 
  94 FILE1_SORTED_PATH="$TEMP_DIRPATH"/"$FILE1_BASENAME"_SORTED.csv
 
  95 FILE2_SORTED_PATH="$TEMP_DIRPATH"/"$FILE2_BASENAME"_SORTED.csv
 
  96 ( head -n1 $FILE1_PATH && tail -n+2 $FILE1_PATH | sort -t, -k1,1 | uniq ) > $FILE1_SORTED_PATH
 
  97 ( head -n1 $FILE2_PATH && tail -n+2 $FILE2_PATH | sort -t, -k1,1 | uniq ) > $FILE2_SORTED_PATH
 
  99 # 1. A⋃B. Join files into Union of (file 1) + (file 2) (i.e. throwing nothing out). See [3], [6].
 
 100 FILES_JOINED_COMPLETE="$( join -t, -a1 -a2 -o auto --header "$FILE1_SORTED_PATH" "$FILE2_SORTED_PATH" )"
 
 102 # 2. A⋃(A⋂B). Join files, file 1 uniques only (omit unmatched field1 entries from file 2). See [6].
 
 103 FILES_JOINED_F1UO="$( join -t, -a1 -o auto --header "$FILE1_SORTED_PATH" "$FILE2_SORTED_PATH" )"
 
 105 # 3. B⋃(A⋂B). Join files, file 2 uniques only (omit unmatched field1 entries from file 1). See [6].
 
 106 FILES_JOINED_F2UO="$( join -t, -a2 -o auto --header "$FILE1_SORTED_PATH" "$FILE2_SORTED_PATH" )"
 
 108 # 4. A⋂B. Join files, only entries present in both file 1 and file 2 ( file 1 ⋂ file 2 ). See [6].
 
 109 FILES_JOINED_INTERSECTION="$( join -t, -o auto --header "$FILE1_SORTED_PATH" "$FILE2_SORTED_PATH" )"
 
 111 # 5. A-B. Join files into Relative Complement of (file 1) - (file 2) field1 entries. See See [5], [6].
 
 112 FILES_JOINED_DIFFERENCE_AB="$( join -t, -v1 -o auto --header "$FILE1_SORTED_PATH" "$FILE2_SORTED_PATH" )"
 
 114 # 6. B-A. Join files into Relative Complement of (file 2) - (file 1) field1 entries. See [5], [6].
 
 115 FILES_JOINED_DIFFERENCE_BA="$( join -t, -v2 -o auto --header "$FILE1_SORTED_PATH" "$FILE2_SORTED_PATH" )"
 
 117 # 7. A⊖B. Join files into Symmetric Difference of (file 2) ⊖ (file 1) field1 entries. See [5], [6].
 
 118 FILES_JOINED_DIFFERENCE_SYM="$( join -t, -v1 -v2 -o auto --header "$FILE1_SORTED_PATH" "$FILE2_SORTED_PATH" )"
 
 120 # Attempt to restore to UNIX-style line endings by removing all carriage return (CR) characters OPTION_DOS2UNIX set to true. See [7].
 
 121 if [[ $OPTION_DOS2UNIX == "true" ]]; then
 
 122     FILES_JOINED_COMPLETE="$(echo "$FILES_JOINED_COMPLETE" | tr -d '\r')"
 
 123     FILES_JOINED_F1UO="$(echo "$FILES_JOINED_F1UO" | tr -d '\r')"
 
 124     FILES_JOINED_F2UO="$(echo "$FILES_JOINED_F2UO" | tr -d '\r')"
 
 125     FILES_JOINED_INTERSECTION="$(echo "$FILES_JOINED_INTERSECTION" | tr -d '\r')"
 
 126     FILES_JOINED_DIFFERENCE_AB="$(echo "$FILES_JOINED_DIFFERENCE_AB" | tr -d '\r')"
 
 127     FILES_JOINED_DIFFERENCE_BA="$(echo "$FILES_JOINED_DIFFERENCE_BA" | tr -d '\r')"
 
 128     FILES_JOINED_DIFFERENCE_SYM="$(echo "$FILES_JOINED_DIFFERENCE_SYM" | tr -d '\r')"
 
 131 # Output CSV files to temp directory with carriage returns ('\r') removed.
 
 132 echo "$FILES_JOINED_COMPLETE" > "$TEMP_DIRPATH"/"1.A⋃B..UNION.csv" && echoerr "1.A⋃B..UNION.csv written to $TEMP_DIRPATH"
 
 133 echo "$FILES_JOINED_F1UO" > "$TEMP_DIRPATH"/"2.A⋃(A⋂B)..FILE-1-UNIQUES-ONLY.csv" && echoerr "2.A⋃(A⋂B)..FILE-1-UNIQUES-ONLY.csv written to $TEMP_DIRPATH"
 
 134 echo "$FILES_JOINED_F2UO" > "$TEMP_DIRPATH"/"3.B⋃(A⋂B)..FILE-2-UNIQUES-ONLY.csv" && echoerr "3.B⋃(A⋂B)..FILE-2-UNIQUES-ONLY.csv written to $TEMP_DIRPATH"
 
 135 echo "$FILES_JOINED_INTERSECTION" > "$TEMP_DIRPATH"/"4.A⋂B..INTERSECTION.csv" && echoerr "4.A⋂B..INTERSECTION.csv written to $TEMP_DIRPATH"
 
 136 echo "$FILES_JOINED_DIFFERENCE_AB" > "$TEMP_DIRPATH"/"5.A-B..DIFFERENCE_A_MINUS_B.csv" && echoerr "5.A-B..DIFFERENCE_A_MINUS_B.csv written to $TEMP_DIRPATH"
 
 137 echo "$FILES_JOINED_DIFFERENCE_BA" > "$TEMP_DIRPATH"/"6.B-A..DIFFERENCE_B_MINUS_A.csv" && echoerr "6.B-A..DIFFERENCE_B_MINUS_A.csv written to $TEMP_DIRPATH"
 
 138 echo "$FILES_JOINED_DIFFERENCE_SYM" > "$TEMP_DIRPATH"/"7.A⊖B..DIFFERENCE_SYM.csv" && echoerr "7.A⊖B..DIFFERENCE_SYM.csv written to $TEMP_DIRPATH"
 
 141 # [1]: How to sort a csv file while retaining the first line. https://stackoverflow.com/a/14562674
 
 142 # [2]: How to sort a csv file by sorting on a single field. https://stackoverflow.com/a/44744800
 
 143 # [3]: How to join multiple sorted csv files into one. https://stackoverflow.com/a/27606199
 
 144 # [4]: How to join a csv file retaining the first line, sorting by certain fields, and not omitting nonmatching fields. https://superuser.com/a/1527095
 
 145 # [5]: How to determine symmetric difference between two sorted files. https://www.gnu.org/software/coreutils/manual/html_node/Set-operations.html
 
 146 # [6]: Set theory symbols https://www.rapidtables.com/math/symbols/Set_Symbols.html
 
 147 # [7]: How to convert DOS-style to UNIX-style line endings. https://stackoverflow.com/a/2613834
 
 151     # GNU bash, version 5.0.3(1)-release (x86_64-pc-linux-gnu)
 
 152     # Copyright (C) 2019 Free Software Foundation, Inc.
 
 153     # License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
 
 155     # This is free software; you are free to change and redistribute it.
 
 156     # There is NO WARRANTY, to the extent permitted by law.
 
 159     # join (GNU coreutils) 8.30
 
 160     # Copyright (C) 2018 Free Software Foundation, Inc.
 
 161     # License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>.
 
 162     # This is free software: you are free to change and redistribute it.
 
 163     # There is NO WARRANTY, to the extent permitted by law.
 
 165     # Written by Mike Haertel.
 
 168     # sort (GNU coreutils) 8.30
 
 169     # Copyright (C) 2018 Free Software Foundation, Inc.
 
 170     # License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>.
 
 171     # This is free software: you are free to change and redistribute it.
 
 172     # There is NO WARRANTY, to the extent permitted by law.
 
 174     # Written by Mike Haertel and Paul Eggert.
 
 177     # head (GNU coreutils) 8.30
 
 178     # Copyright (C) 2018 Free Software Foundation, Inc.
 
 179     # License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>.
 
 180     # This is free software: you are free to change and redistribute it.
 
 181     # There is NO WARRANTY, to the extent permitted by law.
 
 183     # Written by David MacKenzie and Jim Meyering.
 
 186     # tail (GNU coreutils) 8.30
 
 187     # Copyright (C) 2018 Free Software Foundation, Inc.
 
 188     # License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>.
 
 189     # This is free software: you are free to change and redistribute it.
 
 190     # There is NO WARRANTY, to the extent permitted by law.
 
 192     # Written by Paul Rubin, David MacKenzie, Ian Lance Taylor,
 
 196     # mkdir (GNU coreutils) 8.30
 
 197     # Copyright (C) 2018 Free Software Foundation, Inc.
 
 198     # License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>.
 
 199     # This is free software: you are free to change and redistribute it.
 
 200     # There is NO WARRANTY, to the extent permitted by law.
 
 202     # Written by David MacKenzie.
 
 205 # - Created in task 20200220_90eb0c7d