Commit | Line | Data |
---|---|---|
22772fe8 SBS |
#!/usr/bin/env bash
# Desc: Mixes input lines while also preserving some neighbors
# Usage: cat file | bkshuf [arg1]
# Version 0.2.0
# Depends: bc 1.07.1, GNU Coreutils 8.32 (shuf)
# Input: stdin: lines to mix (blank lines are skipped)
#        arg1:  (optional) maximum number of lines to output;
#               defaults to the number of input lines
7 | ||
2ddc75a5 SBS |
# Stop quietly when the reader of stdout goes away (e.g. `bkshuf | head`).
trap 'exit;' SIGPIPE; # exit early if stdout not being read

# Load env vars
# Both tuning parameters may be supplied through the environment. A default
# is applied only when the variable is unset; a variable that is set (even to
# an empty string) is left untouched.
## For these numbers of lines of input...
: "${BKSHUF_PARAM_LINEC=1000000}";
## ... target this group size.
: "${BKSHUF_PARAM_GSIZE=25}";
22772fe8 SBS |
15 | |
16 | ||
# Minimal error-reporting helpers.
#   yell ARGS...  print "<script path>: ARGS" to stderr
#   die  ARGS...  same as yell, then exit the shell with status 111
#   must CMD...   run CMD with its arguments; on failure die with "cannot CMD..."
# printf is used instead of echo so message text is never interpreted as
# echo options or escape sequences.
yell() { printf '%s\n' "$0: $*" >&2; } # print script path and all args to stderr
die() { yell "$*"; exit 111; } # same as yell() but non-zero exit status
must() { "$@" || die "cannot $*"; } # runs args as command, reports args if command fails
read_stdin() {
    # Desc: Consumes stdin; outputs as stdout lines
    # Usage: read_stdin
    # Input: stdin (consumed in full). Pipes and redirected files are both
    #          read; an interactive terminal is not read at all.
    # Output: stdout: the input lines, unmodified, each newline-terminated.
    #           Nothing is printed when there is no input.
    # Example: printf "foo\nbar\n" | read_stdin
    # Depends: GNU bash 4.0+ (mapfile)
    # Version: 0.0.2
    local -a lines=();

    # Do not block waiting on a terminal when nothing was piped/redirected in.
    if [[ -t 0 ]]; then
        return 0;
    fi;

    # mapfile keeps leading/trailing whitespace and backslashes intact and
    # also captures a final line that has no trailing newline.
    mapfile -t lines;

    # Guard the print: printf with zero arguments would still emit one
    # blank line.
    if [[ ${#lines[@]} -gt 0 ]]; then
        printf "%s\n" "${lines[@]}";
    fi;
}; # read stdin to stdout lines
checkInt() {
    # Desc: Checks if arg is a non-negative integer written in decimal digits
    # Usage: checkInt arg
    # Input: arg: string to test
    # Output: - return code 0 (if arg consists solely of digits 0-9)
    #         - return code 1 (otherwise: empty, signed, fractional, padded...)
    #         - calls die() unless exactly one argument is given
    # Example: if ! checkInt $arg; then echo "not int"; fi;
    # Version: 0.0.2
    local re_int='^[0-9]+$'; # kept local so no pattern variable leaks out

    #===Process Arg===
    if [[ $# -ne 1 ]]; then
        die "ERROR:Invalid number of arguments:$#";
    fi;

    #===Determine function return code===
    # The status of the regex test is the function's return code.
    [[ $1 =~ $re_int ]];
} # Checks if arg is integer
consume_line() {
    # Desc: Outputs and destroys line from list_input starting at index ip
    # Usage: consume_line;
    # Input: var: list_input   array  input lines (consumed slots are unset)
    #        var: lco          int    line count original
    #        var: lcr          int    line count remaining
    #        var: ip           int    list_input index pointer
    # Output: stdout: a single non-blank element from list_input
    #         list_input       one element destroyed
    #         var: lc_out      incremented once
    #         var: lcr         decremented once
    #         var: ip          left on the index of the consumed element
    #         return code 0 if a line was consumed; 1 if no non-blank element
    #           was found (nothing printed, counters untouched)
    local n line;

    n=0; # number of slots examined so far
    ### Visit each of the lco slots at most once, starting at ip
    while [[ $n -lt $lco ]]; do
        #### check if line at ip is blank
        line="${list_input[$ip]}";
        if [[ -n $line ]]; then
            ##### consume line at index ip
            printf "%s\n" "$line"; # print to output

            unset "list_input[$ip]"; # destroy line in list_input array
            # Plain assignments: ((x++)) reports failure when x was 0.
            lc_out=$(( lc_out + 1 ));
            lcr=$(( lcr - 1 )); # decrement line count remaining lcr
            #yell "DEBUG:Consumed line ip:$ip:$line";
            return 0;
        fi;
        #### advance pointer; valid indices are 0..lco-1, so wrap with modulo
        #### (guarantees every slot, including the one just before the
        ####  starting index, is reached within lco steps)
        ip=$(( (ip + 1) % lco ));
        #### track progress through list_input
        n=$(( n + 1 ));
    done;

    return 1;
}; # consume and output line in list_input array starting at index ip
decide_read() {
    # Desc: Decides whether to read another line in list_input array
    #       by comparing $RANDOM to p_seq_int
    # Usage: if decide_read; then something; fi;
    # Input: var: p_seq_int   probability (int [0 32767])
    # Output: return code 0 if $RANDOM is less than p_seq_int; 1 otherwise

    # Each call draws one fresh value from $RANDOM; the comparison's own
    # exit status is the function's return code.
    [[ $RANDOM -lt $p_seq_int ]];
}; # returns 0 with probability p_seq; 1 otherwise
119 | ||
main() {
    # Desc: Reads lines from stdin and writes them to stdout in a mixed
    #       order: repeatedly jumps to a random input position, outputs the
    #       line found there, then keeps outputting the following lines
    #       with probability p_seq before jumping again.
    # Usage: main [arg1]
    # Input: arg1:                    (optional) max number of lines to output
    #        stdin:                   input lines (blank lines are skipped)
    #        var: BKSHUF_PARAM_LINEC  int >= 2; reference input line count
    #        var: BKSHUF_PARAM_GSIZE  int >= 1; group size targeted at that
    #                                 reference line count
    # Output: stdout: mixed lines
    #         exits via die() on invalid arguments/parameters or when
    #         bc / shuf do not deliver a usable value
    # Depends: checkInt(), die(), read_stdin(), consume_line(), decide_read(),
    #          bc, shuf
    # Note: the working variables are local to main; consume_line() and
    #       decide_read() still see them because they are called from here.
    local par_l0 par_s0 s_exp s;
    local lc lc_out_max line;
    local p_jump p_seq p_seq_int;
    local lco lcr lc_out ip n_loop1;
    local -a list_input=();

    # Check positional arguments
    if [[ $# -gt 0 ]]; then
        if ! checkInt "$1"; then
            die "FATAL:Not an integer:$1";
        fi;
        lc_out_max="$(( 10#$1 ))"; # output line count; 10# so "08" is not read as octal
    fi;

    # Check env vars
    if ! checkInt "$BKSHUF_PARAM_LINEC"; then
        die "FATAL:Not an int:BKSHUF_PARAM_LINEC:$BKSHUF_PARAM_LINEC"; fi;
    if ! checkInt "$BKSHUF_PARAM_GSIZE"; then
        die "FATAL:Not an int:BKSHUF_PARAM_GSIZE:$BKSHUF_PARAM_GSIZE"; fi;
    ## ln(LINEC)^2 is a divisor below, so LINEC must exceed 1
    if (( 10#$BKSHUF_PARAM_LINEC < 2 )); then
        die "FATAL:Must be at least 2:BKSHUF_PARAM_LINEC:$BKSHUF_PARAM_LINEC"; fi;
    ## a group holds at least one line
    if (( 10#$BKSHUF_PARAM_GSIZE < 1 )); then
        die "FATAL:Must be at least 1:BKSHUF_PARAM_GSIZE:$BKSHUF_PARAM_GSIZE"; fi;

    # store input lines from stdin (like `shuf`)
    ## IFS= keeps indentation and trailing spaces of each line intact
    while IFS= read -r line; do
        ## skip blank lines (empty or only spaces/tabs)
        if [[ -z ${line//[$' \t']/} ]]; then continue; fi;
        #yell "DEBUG:INPUT:$line";
        list_input+=("$line");
    done < <( read_stdin; );

    # calc line count (lc)
    lc="${#list_input[@]}";
    #yell "DEBUG:lc:$lc";
    #yell "DEBUG:list_input:$(declare -p list_input)";

    # Nothing to mix; also avoids ln(0) in bc and an empty range for shuf.
    if [[ $lc -eq 0 ]]; then
        return 0;
    fi;

    # calculate group size s
    ## s = (s0 - 1) / ln(l0)^2 * ln(lc)^2 + 1
    ## i.e. s is 1 for a single input line and s0 for l0 input lines
    par_l0="$BKSHUF_PARAM_LINEC";
    par_s0="$BKSHUF_PARAM_GSIZE";
    s_exp="(( $par_s0 - 1 )/( ( l( $par_l0 ) )^2 ))*(l( $lc ))^2+1";
    s="$(bc -l <<< "scale=12; $s_exp")" || die "FATAL:bc failed:s";

    # calculate probabilities p_jump, p_seq
    p_jump="$(bc -l <<< "scale=12; 1 / ( $s )")" || die "FATAL:bc failed:p_jump";
    p_seq="$(bc -l <<< "scale=12; 1 - $p_jump")" || die "FATAL:bc failed:p_seq";
    ## p_seq as int [0 32767] for $RANDOM range
    p_seq_int="$(bc -l <<< "scale=0; ($p_seq * 32767)/1")" || die "FATAL:bc failed:p_seq_int";
    ## bc may report an error on stderr yet exit 0; verify the result itself
    if ! checkInt "$p_seq_int"; then
        die "FATAL:Not an int:p_seq_int:$p_seq_int"; fi;

    # generate output
    lco="$lc";      # save original input line count
    lcr="$lco";     # lines not yet consumed
    lc_out="0";     # init output line counter
    if [[ -z "$lc_out_max" ]]; then lc_out_max="$lco"; fi;
    RANDOM="$(shuf -i0-32767 -n1)" || die "FATAL:shuf failed:seed"; # init Bash PRNG
    n_loop1="0";
    while [[ $lcr -ge 1 ]] && [[ $lc_out -lt $lc_out_max ]]; do
        ## Select random unconsumed input line and consume it to output
        ## (an empty ip would never consume anything, so fail loudly)
        ip="$(shuf -i0-$(( lco - 1 )) -n1)" || die "FATAL:shuf failed:ip";
        consume_line;

        ## Consume the next sequential line with probability p_seq.
        while decide_read && [[ $lcr -ge 1 ]] && [[ $lc_out -lt $lc_out_max ]]; do
            consume_line;
        done;

        n_loop1=$(( n_loop1 + 1 )); # increment jump counter
    done;
    #yell "DEBUG:n_loop1:$n_loop1"; # count jumps

    return 0;
}; # main program
185 | ||
# Entry point: hand all command-line arguments to main unchanged.
main "$@";
187 | ||
# Author: Steven Baltakatei Sandoval
# License: GPLv3+

# Dependency information

# bc 1.07.1
# Copyright 1991-1994, 1997, 1998, 2000, 2004, 2006, 2008, 2012-2017 Free Software Foundation, Inc.

# shuf (GNU coreutils) 8.32
# Copyright (C) 2020 Free Software Foundation, Inc.
# License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>.
# This is free software: you are free to change and redistribute it.
# There is NO WARRANTY, to the extent permitted by law.

# Written by Paul Eggert.
204 | ||
205 |