feat(unitproc/bkshuf):Exit early if stdout not being read
[BK-2020-03.git] / unitproc / bkshuf
1 #!/usr/bin/env bash
2 # Desc: Mixes input lines while also preserving some neighbors
3 # Usage: cat file | bkshuf arg1
4 # Version 0.2.0
5 # Depends: bc 1.07.1, GNU Coreutils 8.32 (shuf)
6 # Input: arg1: int  maximum number of lines to output (optional; default: all input lines)
7
# Stop immediately once nothing is reading our stdout any more
# (for example when the consumer is `head` and it has seen enough).
trap 'exit;' SIGPIPE;

# Tunable parameters. Each default is applied only when the variable is
# unset; a value already present in the environment is left untouched.
#
# Calibration point for the group-size formula used in main(): an input of
# BKSHUF_PARAM_LINEC lines ...
: "${BKSHUF_PARAM_LINEC=1000000}"
# ... is shuffled with a target group size of BKSHUF_PARAM_GSIZE lines.
: "${BKSHUF_PARAM_GSIZE=25}"
15
16
yell() {
  # Desc: Writes the script path ($0) followed by all arguments to stderr
  # Usage: yell arg1 arg2 ...
  # Output: stderr: "<script path>: <args joined by spaces>"
  printf '%s\n' "$0: $*" >&2
}
die() {
  # Desc: Reports all arguments on stderr via yell(), then terminates
  #       the script with exit status 111
  # Usage: die "message"
  yell "$*"
  exit 111
}
must() {
  # Desc: Runs its arguments as a command; if the command fails, aborts
  #       the script through die(), naming the command that failed
  # Usage: must cmd arg1 arg2 ...
  if ! "$@"; then
    die "cannot $*"
  fi
}
read_stdin() {
  # Desc: Consumes stdin; outputs as stdout lines
  # Input: stdin (consumes); read whenever stdin is not a terminal, so both
  #        pipes (`cmd | read_stdin`) and redirects (`read_stdin < file`) work
  # Output: stdout (newline delimited); nothing at all if stdin was empty
  # Notes: - Trailing newlines of the input are dropped by the command
  #          substitution, so trailing blank lines are not reproduced.
  #        - Each line is read with `read -r` and the default IFS, so leading
  #          and trailing whitespace of a line is trimmed.
  # Example: printf "foo\nbar\n" | read_stdin
  # Depends: GNU bash (version 5.1.16)
  # Version: 0.0.2
  local input_stdin line;
  local -a output=();

  # Store stdin (skipped on an interactive terminal so we never block
  # waiting for keyboard input)
  input_stdin="";
  if [[ ! -t 0 ]]; then
    input_stdin="$(cat -)";
  fi;

  # Nothing was read: emit nothing. (Calling printf with an empty argument
  # list would still print one blank line.)
  if [[ -z $input_stdin ]]; then
    return 0;
  fi;

  # Store as output array elements
  while read -r line; do
    output+=("$line");
  done < <(printf "%s\n" "$input_stdin");

  # Print to stdout
  printf "%s\n" "${output[@]}";
}; # read stdin to stdout lines
checkInt() {
  # Desc: Checks if arg is a non-negative integer (one or more digits 0-9;
  #       no sign, no whitespace, no decimal point)
  # Usage: checkInt arg
  # Input: arg: string to test
  # Output: - return code 0 (if arg is integer)
  #         - return code 1 (if arg is not integer)
  #         - script terminated via die() if not called with exactly one arg
  # Example: if ! checkInt $arg; then echo "not int"; fi;
  # Version: 0.0.2
  local -r re_uint='^[0-9]+$'; # kept local so no global is left behind

  #===Process Arg===
  if [[ $# -ne 1 ]]; then
    die "ERROR:Invalid number of arguments:$#";
  fi;

  #===Determine function return code===
  if [[ $1 =~ $re_uint ]]; then
    return 0;
  fi;
  return 1;
} # Checks if arg is integer
consume_line() {
  # Desc: Outputs and destroys line from list_input starting at index ip
  # Usage: consume_line;
  # Input: var: list_input  array  input lines (indices 0 .. lco-1; consumed
  #                                elements are unset)
  #        var: lco         int    line count original
  #        var: lcr         int    line count remaining
  #        var: ip          int    list_input index pointer
  # Output: stdout: a single non-blank element from list_input
  #         var: list_input  one element destroyed
  #         var: lc_out      incremented once
  #         var: lcr         decremented once
  #         var: ip          left at the index that was consumed, so the next
  #                          call continues with the following line
  #         return code 0 if a line was consumed, 1 if none was found
  local n line;

  # Nothing to scan (also protects the modulo below from dividing by zero).
  if (( lco < 1 )); then
    return 1;
  fi;

  ### Visit each of the lco valid indices at most once, starting at ip,
  ### until a non-blank line is found
  for (( n = 0; n < lco; n++ )); do
    #### check if line at ip is blank
    line="${list_input[$ip]}";
    if [[ -n $line ]]; then
      ##### consume line at index ip
      printf "%s\n" "$line"; # print to output
      unset "list_input[$ip]"; # destroy line in list_input array
      ((lc_out++));
      ((lcr--)); # decrement line count remaining lcr
      return 0;
    fi;
    #### advance the pointer, wrapping from the last valid index (lco-1)
    #### back to 0 so that no out-of-range index is ever probed
    ip=$(( (ip + 1) % lco ));
  done;

  return 1;
}; # consume and output line in list_input array starting at index ip
decide_read() {
  # Desc: Random yes/no decision on whether another line of the
  #       list_input array should be read: draws $RANDOM and compares
  #       it with the threshold p_seq_int
  # Usage: if decide_read; then something; fi;
  # Input: var: p_seq_int   probability (int [0 32767])
  # Output: return code 0 when $RANDOM is below p_seq_int, 1 otherwise
  [[ $RANDOM -lt $p_seq_int ]] && return 0
  return 1
}; # returns 0 with probability p_seq; 1 otherwise
119
main() {
  # Desc: Reads lines from stdin and writes them to stdout in a mixed order
  #       that keeps some neighbouring lines together: it repeatedly jumps
  #       to a random index, consumes the next unconsumed line from there,
  #       then keeps consuming following lines with probability p_seq.
  # Usage: main [max_lines]
  # Input: arg1: int  maximum number of lines to output (optional;
  #                   default: all non-blank input lines)
  #        var: BKSHUF_PARAM_LINEC  int  reference input line count
  #        var: BKSHUF_PARAM_GSIZE  int  group size targeted at that count
  #        stdin: input lines (blank lines are skipped)
  # Output: stdout: selected lines, one per line
  # Depends: bc, shuf, checkInt(), read_stdin(), consume_line(),
  #          decide_read(), die()
  # Note: the working variables are local to main; consume_line() and
  #       decide_read() still see and modify them because they are called
  #       from within main.
  local par_l0 par_s0 s_exp s line;
  local lc lco lcr lc_out lc_out_max ip n_loop1;
  local p_jump p_seq p_seq_int;
  local -a list_input=();

  # Check positional arguments
  lc_out_max="";
  if [[ $# -gt 0 ]]; then
    if ! checkInt "$1"; then
      die "FATAL:Not an integer:$1";
    fi;
    # Force base 10 so a value such as "08" is not rejected as invalid octal
    # by the numeric comparisons below.
    lc_out_max="$(( 10#$1 ))"; # output line count
  fi;

  # Check env vars
  if ! checkInt "$BKSHUF_PARAM_LINEC"; then
    die "FATAL:Not an int:BKSHUF_PARAM_LINEC:$BKSHUF_PARAM_LINEC"; fi;
  if ! checkInt "$BKSHUF_PARAM_GSIZE"; then
    die "FATAL:Not an int:BKSHUF_PARAM_GSIZE:$BKSHUF_PARAM_GSIZE"; fi;

  # store input lines from stdin (like `shuf`)
  while read -r line; do
    if [[ -z $line ]]; then continue; fi; # skip blank lines
    list_input+=("$line");
  done < <( read_stdin; );

  # calc line count (lc)
  lc="${#list_input[@]}";

  # No input lines: nothing to output. Returning here avoids evaluating
  # l(0) in bc and asking shuf for the invalid range 0..-1.
  if [[ $lc -eq 0 ]]; then
    return 0;
  fi;

  # calculate group size s
  #   s = (s0 - 1) / ln(l0)^2 * ln(lc)^2 + 1
  # so s equals s0 when lc equals l0, and s is 1 for a single input line.
  par_l0="$BKSHUF_PARAM_LINEC";
  par_s0="$BKSHUF_PARAM_GSIZE";
  s_exp="(( $par_s0 - 1 )/( ( l( $par_l0 ) )^2 ))*(l( $lc ))^2+1";
  s="$(echo "scale=12; $s_exp" | bc -l)";

  # calculate probabilities p_jump, p_seq
  p_jump="$(echo "scale=12; 1 / ( $s )" | bc -l)";
  p_seq="$(echo "scale=12; 1 - $p_jump" | bc -l)";
  p_seq_int="$(echo "scale=0; ($p_seq * 32767)/1" | bc -l)"; # p_seq as int [0 32767] for $RANDOM range

  # generate output
  lco="$lc"; # save original input line count
  lcr="$lco";
  lc_out="0"; # init output line counter
  if [[ -z "$lc_out_max" ]]; then lc_out_max="$lco"; fi;
  RANDOM="$(shuf -i0-32767 -n1)"; # init Bash PRNG
  n_loop1="0";
  while [[ $lcr -ge 1 ]] && [[ $lc_out -lt $lc_out_max ]]; do
    ## Select random unconsumed input line and consume it to output
    ip="$(shuf -i0-$(( lco - 1 )) -n1)"; # jump: new random index pointer
    consume_line;

    ## Consume the next sequential line with probability p_seq.
    while decide_read && [[ $lcr -ge 1 ]] && [[ $lc_out -lt $lc_out_max ]]; do
      consume_line;
    done;

    ((n_loop1++)); # increment jump counter
  done;
  #yell "DEBUG:n_loop1:$n_loop1"; # count jumps

}; # main program
185
# Entry point: pass every command-line argument through to main unchanged.
main "$@"
187
188 # Author: Steven Baltakatei Sandoval
189 # License: GPLv3+
190
191 # Dependency information
192
193 # bc 1.07.1
194 # Copyright 1991-1994, 1997, 1998, 2000, 2004, 2006, 2008, 2012-2017 Free Software Foundation, Inc.
195
196
197 # shuf (GNU coreutils) 8.32
198 # Copyright (C) 2020 Free Software Foundation, Inc.
199 # License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>.
200 # This is free software: you are free to change and redistribute it.
201 # There is NO WARRANTY, to the extent permitted by law.
202
203 # Written by Paul Eggert.
204
205