| 1 | #!/usr/bin/env bash |
| 2 | # Desc: Mixes input lines while also preserving some neighbors |
| 3 | # Usage: cat file | bkshuf arg1 |
| 4 | # Version 0.1.2 |
| 5 | # Depends: bc 1.07.1, GNU Coreutils 8.32 (shuf) |
| 6 | # Input: var: arg1 initial lines to output |
| 7 | |
# Load env vars
# A value that is already set (even to an empty string) is left untouched;
# the defaults below apply only when the variable is unset.
## Reference input size: for this many lines of input...
: "${BKSHUF_PARAM_LINEC=1000000}";
## ...target this group size.
: "${BKSHUF_PARAM_GSIZE=25}";
| 13 | |
| 14 | |
yell() {
    # Desc: Prints the script path ($0), a colon, and all args (joined
    #   into one string) to stderr
    # Usage: yell "some message"
    # Output: stderr: "<script path>: <args>"
    printf '%s: %s\n' "$0" "$*" >&2;
}; # print script path and all args to stderr
die() {
    # Desc: Reports all args through yell(), then terminates the shell
    #   with exit status 111
    # Usage: die "reason"
    # Depends: yell()
    yell "$*";
    exit 111;
}; # same as yell() but non-zero exit status
must() {
    # Desc: Runs its args as a command; if the command fails, hands
    #   "cannot <args>" to die()
    # Usage: must mkdir -p "$dir"
    # Depends: die()
    if ! "$@"; then
        die "cannot $*";
    fi;
}; # runs args as command, reports args if command fails
read_stdin() {
    # Desc: Consumes stdin; outputs as stdout lines
    # Input: stdin (consumes); only read when stdin is a pipe, so an
    #          interactive terminal never blocks the function
    # Output: stdout (newline delimited); nothing at all if no input
    #           was read
    # Note: Leading/trailing whitespace and backslashes within a line
    #         are preserved. Trailing newlines of the input are removed
    #         by the command substitution, so trailing blank lines are
    #         not reproduced.
    # Example: printf "foo\nbar\n" | read_stdin
    # Depends: GNU bash (version 5.1.16)
    # Version: 0.0.1
    local input_stdin;
    local -a output;

    input_stdin="";
    output=();

    # Store stdin
    if [[ -p /dev/stdin ]]; then
        input_stdin="$(cat -)";
    fi;

    # Store as output array elements
    ## mapfile keeps each line verbatim (no IFS trimming, no backslash
    ## processing) and does not leak a loop variable into the caller.
    if [[ -n $input_stdin ]]; then
        mapfile -t output <<< "$input_stdin";
    fi;

    # Print to stdout
    ## Skip printf for an empty array; printf with no arguments would
    ## still emit one blank line.
    if [[ ${#output[@]} -gt 0 ]]; then
        printf "%s\n" "${output[@]}";
    fi;
}; # read stdin to stdout lines
checkInt() {
    # Desc: Checks if arg is integer (one or more decimal digits;
    #   no sign, no whitespace, no decimal point)
    # Usage: checkInt arg
    # Input: arg: integer
    # Output: - return code 0 (if arg is integer)
    #         - return code 1 (if arg is not integer)
    #         - calls die() if not given exactly one argument
    # Example: if ! checkInt $arg; then echo "not int"; fi;
    # Depends: die()
    # Version: 0.0.1
    local re_uint;

    #===Process Arg===
    if [[ $# -ne 1 ]]; then
        die "ERROR:Invalid number of arguments:$#";
    fi;

    #===Determine function return code===
    ## The pattern is kept in a local variable so that it is used as a
    ## regular expression without leaking a global into the caller.
    re_uint='^[0-9]+$';
    if [[ $1 =~ $re_uint ]]; then
        return 0;
    fi;
    return 1;
} # Checks if arg is integer
consume_line() {
    # Desc: Outputs and destroys line from list_input starting at index ip
    # Usage: consume_line;
    # Input: var: list_input  array  input lines
    #        var: lco         int    line count original
    #        var: lcr         int    line count remaining
    #        var: ip          int    list_input index pointer
    # Output: stdout: a single non-blank element from list_input
    #         var: list_input  one element destroyed
    #         var: lc_out      incremented once
    #         var: lcr         decremented once
    #         var: ip          left at the index of the consumed element
    #         return code 0 (also when no non-blank element was found,
    #           in which case nothing is printed or modified except ip)
    local n line;

    ## Nothing to scan (also avoids a modulo-by-zero below)
    if [[ $lco -lt 1 ]]; then return 0; fi;

    n=0; # for tracking progress iterating through remaining list_input
    ### Loop once through list_input indices until a non-blank line found.
    ### Valid indices are 0..lco-1, so lco steps visit every index once.
    while [[ $n -lt $lco ]]; do
        #### check if line at ip is blank (consumed elements read as empty)
        line="${list_input[$ip]}";
        if [[ -n $line ]]; then
            ##### consume line at index ip
            ##### Printed in the foreground: a background job per line
            ##### would let lines reach stdout out of order.
            printf "%s\n" "$line";   # print to output
            unset "list_input[$ip]"; # destroy line in list_input array
            lc_out=$(( lc_out + 1 ));
            lcr=$(( lcr - 1 ));      # decrement line count remaining lcr
            #yell "DEBUG:Consumed line ip:$ip:$line";
            break;
        fi;
        #### increment input index pointer, looping around if necessary
        ip=$(( (ip + 1) % lco ));
        #### track progress through list_input
        n=$(( n + 1 ));
    done;
    return 0;
}; # consume and output line in list_input array starting at index ip
decide_read() {
    # Desc: Random gate deciding whether another line of the list_input
    #   array should be read: draws $RANDOM once and succeeds when the
    #   draw is below p_seq_int
    # Usage: if decide_read; then something; fi;
    # Input: var: p_seq_int  probability (int [0 32767])
    # Output: return code 0 (draw is less than p_seq_int)
    #         return code 1 (otherwise)
    local draw;

    draw="$RANDOM";
    [[ $draw -lt $p_seq_int ]] || return 1;
    return 0;
}; # returns 0 with probability p_seq; 1 otherwise
| 116 | |
main() {
    # Desc: Reads lines from stdin and writes them to stdout in a mixed
    #   order: repeatedly jumps to a random input line, outputs it, then
    #   keeps outputting the following lines with probability p_seq.
    # Usage: main [arg1]
    # Input: arg1: int  (optional) maximum number of lines to output;
    #                   defaults to the number of stored input lines
    #        stdin: input lines (blank lines are skipped)
    #        var: BKSHUF_PARAM_LINEC  int >= 2  reference input line count
    #        var: BKSHUF_PARAM_GSIZE  int >= 1  group size targeted at
    #                                           that line count
    # Output: stdout: consumed input lines, one per line
    #         return code 0 on success; die() is called on invalid
    #           arguments/parameters or when bc/shuf fail
    # Depends: read_stdin(), checkInt(), consume_line(), decide_read(),
    #          die(), bc, shuf
    declare par_l0 par_s0 s_exp s line;
    declare lc_out_max p_jump p_seq;
    # State shared with consume_line() and decide_read(); those functions
    # see and modify these function-local variables through dynamic scoping.
    declare p_seq_int lco lcr lc_out ip n_loop1;
    declare -a list_input;

    # Check positional arguments
    lc_out_max="";
    if [[ $# -gt 0 ]]; then
        if ! checkInt "$1"; then
            die "FATAL:Not an integer:$1";
        fi;
        ## force base 10 so values such as "08" are not parsed as octal
        lc_out_max="$(( 10#$1 ))"; # output line count
    fi;

    # Check env vars
    if ! checkInt "$BKSHUF_PARAM_LINEC"; then
        die "FATAL:Not an int:BKSHUF_PARAM_LINEC:$BKSHUF_PARAM_LINEC"; fi;
    if ! checkInt "$BKSHUF_PARAM_GSIZE"; then
        die "FATAL:Not an int:BKSHUF_PARAM_GSIZE:$BKSHUF_PARAM_GSIZE"; fi;
    ## ln(LINEC) is a divisor in the group size formula; it must be > 0
    if (( 10#$BKSHUF_PARAM_LINEC < 2 )); then
        die "FATAL:Must be 2 or greater:BKSHUF_PARAM_LINEC:$BKSHUF_PARAM_LINEC"; fi;
    ## a group size below 1 would give a jump probability outside [0 1]
    if (( 10#$BKSHUF_PARAM_GSIZE < 1 )); then
        die "FATAL:Must be 1 or greater:BKSHUF_PARAM_GSIZE:$BKSHUF_PARAM_GSIZE"; fi;

    # store input lines from stdin (like `shuf`)
    while read -r line; do
        if [[ -z $line ]]; then continue; fi; # skip blank lines
        #yell "DEBUG:INPUT:$line";
        list_input+=("$line");
    done < <( read_stdin; );

    # calc line count
    lco="${#list_input[@]}"; # save original input line count
    #yell "DEBUG:lco:$lco";
    #yell "DEBUG:list_input:$(declare -p list_input)";

    # nothing to output; also keeps l(0) away from bc and an empty
    # range away from shuf
    if [[ $lco -lt 1 ]]; then return 0; fi;

    # calculate group size s
    ## s = (s0 - 1) / ln(l0)^2 * ln(lco)^2 + 1
    ## i.e. s is 1 for a single line of input and s0 for l0 lines.
    par_l0="$BKSHUF_PARAM_LINEC";
    par_s0="$BKSHUF_PARAM_GSIZE";
    s_exp="(( $par_s0 - 1 )/( ( l( $par_l0 ) )^2 ))*(l( $lco ))^2+1";
    s="$(echo "scale=12; $s_exp" | bc -l)";

    # calculate probabilities p_jump, p_seq
    p_jump="$(echo "scale=12; 1 / ( $s )" | bc -l)";
    p_seq="$(echo "scale=12; 1 - $p_jump" | bc -l)";
    p_seq_int="$(echo "scale=0; ($p_seq * 32767)/1" | bc -l)"; # p_seq as int [0 32767] for $RANDOM range
    if ! checkInt "$p_seq_int"; then
        die "FATAL:Could not calculate p_seq_int (bc failed?):$p_seq_int"; fi;

    # generate output
    lcr="$lco";
    lc_out="0"; # init output line counter
    if [[ -z "$lc_out_max" ]]; then lc_out_max="$lco"; fi;
    RANDOM="$(shuf -i0-32767 -n1)" \
        || die "FATAL:shuf failed to seed RANDOM"; # init Bash PRNG
    n_loop1="0";
    while [[ $lcr -ge 1 ]] && [[ $lc_out -lt $lc_out_max ]]; do
        ## Select random unconsumed input line and consume it to output
        ## (abort if shuf fails; an empty ip would never consume a line)
        ip="$(shuf -i0-$(( lco - 1 )) -n1)" \
            || die "FATAL:shuf failed to pick an index"; # init input index pointer
        consume_line;

        ## Consume the next sequential line with probability p_seq.
        while decide_read && [[ $lcr -ge 1 ]] && [[ $lc_out -lt $lc_out_max ]]; do
            consume_line;
        done;

        n_loop1=$(( n_loop1 + 1 )); # increment jump counter
    done;
    #yell "DEBUG:n_loop1:$n_loop1"; # count jumps

    ## Explicit success status; the loop's own status must not leak out.
    return 0;
}; # main program
| 182 | |
# Entry point: run the program, passing the script's command-line
# arguments through unchanged.
main "$@";
| 184 | |
| 185 | # Author: Steven Baltakatei Sandoval |
| 186 | # License: GPLv3+ |
| 187 | |
| 188 | # Dependency information |
| 189 | |
| 190 | # bc 1.07.1 |
| 191 | # Copyright 1991-1994, 1997, 1998, 2000, 2004, 2006, 2008, 2012-2017 Free Software Foundation, Inc. |
| 192 | |
| 193 | |
| 194 | # shuf (GNU coreutils) 8.32 |
| 195 | # Copyright (C) 2020 Free Software Foundation, Inc. |
| 196 | # License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>. |
| 197 | # This is free software: you are free to change and redistribute it. |
| 198 | # There is NO WARRANTY, to the extent permitted by law. |
| 199 | |
| 200 | # Written by Paul Eggert. |
| 201 | |
| 202 | |