Commit | Line | Data |
---|---|---|
22772fe8 SBS |
1 | #!/usr/bin/env bash |
2 | # Desc: Mixes input lines while also preserving some neighbors | |
3 | # Usage: cat file | bkshuf arg1 | |
48dab430 | 4 | # Version 0.1.1 |
22772fe8 SBS |
5 | # Depends: bc 1.07.1, GNU Coreutils 8.32 (shuf) |
6 | # Input: var: arg1 initial lines to output | |
7 | ||
b9e8b771 SBS |
# Load env vars (a default is applied only when the variable is unset;
# a variable that is set, even to an empty string, is left untouched)
## Reference input line count...
[[ -v BKSHUF_PARAM_LINEC ]] || BKSHUF_PARAM_LINEC=1000000;
## ... and the group size to target at that input line count.
[[ -v BKSHUF_PARAM_GSIZE ]] || BKSHUF_PARAM_GSIZE=25;
22772fe8 SBS |
13 | |
14 | ||
yell() {
    # Desc: Prints script path ($0) and all args (joined into one string) to stderr
    # Usage: yell arg1 [arg2 ...]
    # Output: stderr: "<script path>: <args>"
    printf '%s: %s\n' "$0" "$*" >&2;
}; # print script path and all args to stderr
die() {
    # Desc: Reports all args to stderr via yell(), then terminates the shell
    # Usage: die arg1 [arg2 ...]
    # Output: stderr: message from yell(); exit status 111
    yell "$*";
    exit 111;
}; # same as yell() but non-zero exit status
must() {
    # Desc: Runs its args as a command; if the command fails, calls die()
    #       with a message naming the command
    # Usage: must command [arg ...]
    if ! "$@"; then
        die "cannot $*";
    fi;
}; # runs args as command, reports args if command fails
read_stdin() {
    # Desc: Consumes stdin; outputs as stdout lines
    # Input: stdin (consumed only when it is a pipe or a redirected regular file)
    # Output: stdout (newline delimited; nothing at all when no input was read)
    # Example: printf "foo\nbar\n" | read_stdin
    # Depends: GNU bash (version 5.1.16)
    # Version: 0.0.1
    local line;
    local -a output=();

    # Store stdin as output array elements
    ## Do not read from anything else (e.g. an interactive terminal), so the
    ## function never blocks waiting for keyboard input.
    if [[ -p /dev/stdin || -f /dev/stdin ]]; then
        ## `IFS=` keeps leading/trailing whitespace of each line intact;
        ## `|| [[ -n $line ]]` keeps a final line lacking a trailing newline.
        while IFS= read -r line || [[ -n $line ]]; do
            output+=("$line");
        done;
    fi;

    # Print to stdout
    ## Guard: printf given zero arguments would still emit one blank line.
    if [[ ${#output[@]} -gt 0 ]]; then
        printf "%s\n" "${output[@]}";
    fi;
}; # read stdin to stdout lines
checkInt() {
    # Desc: Checks if arg is a non-negative integer (decimal digits only)
    # Usage: checkInt arg
    # Input: arg: string to test
    # Output: - return code 0 (if arg consists solely of one or more digits)
    #         - return code 1 (otherwise: empty, signed, fractional, etc.)
    #         - exits via die() if not called with exactly one argument
    # Example: if ! checkInt $arg; then echo "not int"; fi;
    # Version: 0.0.1
    local re_digits='^[0-9]+$'; # kept local so the pattern does not leak to callers

    #===Process Arg===
    if [[ $# -ne 1 ]]; then
        die "ERROR:Invalid number of arguments:$#";
    fi;

    #===Determine function return code===
    ## The status of the regex match is the status of the function.
    [[ $1 =~ $re_digits ]];
}; # Checks if arg is integer
consume_line() {
    # Desc: Outputs and destroys line from list_input starting at index ip
    # Usage: consume_line;
    # Input: var: list_input   array  input lines
    #        var: lco          int    line count original
    #        var: lcr          int    line count remaining
    #        var: lc_out       int    count of lines output so far
    #        var: ip           int    list_input index pointer
    # Output: stdout: a single non-blank element from list_input
    #         var: list_input  one element destroyed
    #         var: lc_out      incremented once
    #         var: lcr         decremented once
    #         var: ip          left at the index of the consumed element
    #         return code 0 if a line was consumed; 1 if none was found
    local n line;

    # Nothing to scan; also protects the modulo operations below
    if [[ $lco -lt 1 ]]; then return 1; fi;

    # Normalize the pointer into the valid index range [0, lco - 1]
    ip=$(( ip % lco ));

    ### Visit each of the lco indices at most once, starting at ip
    for (( n = 0; n < lco; n++ )); do
        #### check if line at ip is blank (blank or unset means already consumed)
        line="${list_input[$ip]}";
        if [[ -n $line ]]; then
            ##### consume line at index ip
            ##### (printed in the foreground so output order is deterministic)
            printf "%s\n" "$line";
            unset "list_input[$ip]"; # destroy line in list_input array
            lc_out=$(( lc_out + 1 ));
            lcr=$(( lcr - 1 ));      # decrement line count remaining lcr
            return 0;
        fi;
        #### advance input index pointer, wrapping to 0 after index lco - 1
        ip=$(( (ip + 1) % lco ));
    done;

    return 1;
}; # consume and output line in list_input array starting at index ip
decide_read() {
    # Desc: Decides whether to read another line in list_input array
    #       by drawing one value from $RANDOM and comparing it to p_seq_int
    # Usage: if decide_read; then something; fi;
    # Input: var: p_seq_int   probability (int [0 32767])
    # Output: return code 0 if the drawn value is less than p_seq_int;
    #         return code 1 otherwise
    local roll="$RANDOM";

    [[ $roll -lt $p_seq_int ]] && return 0;
    return 1;
}; # returns 0 with probability p_seq; 1 otherwise
116 | ||
main() {
    # Desc: Reads lines from stdin and prints them in a mixed order in which
    #       runs of neighboring input lines tend to stay together
    # Usage: main [arg1]
    # Input: arg1: int  maximum number of lines to output (default: all lines)
    #        stdin: input lines (empty and space/tab-only lines are skipped)
    #        var: BKSHUF_PARAM_LINEC  int (>= 2)  reference input line count
    #        var: BKSHUF_PARAM_GSIZE  int (>= 1)  group size targeted at that count
    # Output: stdout: up to arg1 input lines
    # Depends: bc, shuf,
    #          checkInt(), die(), read_stdin(), consume_line(), decide_read()
    declare par_l0 par_s0 s_exp s;
    declare p_jump p_seq p_seq_int;
    declare line ws lc lco lcr lc_out lc_out_max ip;
    declare -a list_input;

    # Check positional arguments
    if [[ $# -gt 0 ]]; then
        if ! checkInt "$1"; then die "FATAL:Not an integer:$1"; fi;
        lc_out_max="$(( 10#$1 ))"; # output line count; 10# avoids octal parsing of leading zeros
    fi;

    # Check env vars
    if ! checkInt "$BKSHUF_PARAM_LINEC"; then
        die "FATAL:Not an int:BKSHUF_PARAM_LINEC:$BKSHUF_PARAM_LINEC"; fi;
    if ! checkInt "$BKSHUF_PARAM_GSIZE"; then
        die "FATAL:Not an int:BKSHUF_PARAM_GSIZE:$BKSHUF_PARAM_GSIZE"; fi;
    ## ln(BKSHUF_PARAM_LINEC)^2 is a divisor below, so values 0 and 1 are unusable
    if [[ $BKSHUF_PARAM_LINEC =~ ^0*[01]$ ]]; then
        die "FATAL:Must be 2 or greater:BKSHUF_PARAM_LINEC:$BKSHUF_PARAM_LINEC"; fi;
    ## a group size of 0 would make the computed group size s drop below 1
    if [[ $BKSHUF_PARAM_GSIZE =~ ^0+$ ]]; then
        die "FATAL:Must be 1 or greater:BKSHUF_PARAM_GSIZE:$BKSHUF_PARAM_GSIZE"; fi;

    # store input lines from stdin (like `shuf`)
    ws=$' \t';
    while IFS= read -r line; do
        ## skip blank lines (consume_line treats an empty slot as already consumed)
        if [[ -z ${line//[$ws]/} ]]; then continue; fi;
        list_input+=("$line");
    done < <( read_stdin; );

    # calc line count (lc)
    lc="${#list_input[@]}";
    ## Nothing to shuffle; also avoids ln(0) in bc and an empty range for shuf
    if [[ $lc -lt 1 ]]; then return 0; fi;

    # calculate group size s
    ## s grows with ln(lc)^2: s = 1 when lc = 1 and s = par_s0 when lc = par_l0
    par_l0="$BKSHUF_PARAM_LINEC";
    par_s0="$BKSHUF_PARAM_GSIZE";
    s_exp="(( $par_s0 - 1 )/( ( l( $par_l0 ) )^2 ))*(l( $lc ))^2+1";
    s="$(bc -l <<< "scale=12; $s_exp")" || die "FATAL:bc failed:s";

    # calculate probabilities p_jump, p_seq
    p_jump="$(bc -l <<< "scale=12; 1 / ( $s )")" || die "FATAL:bc failed:p_jump";
    p_seq="$(bc -l <<< "scale=12; 1 - $p_jump")" || die "FATAL:bc failed:p_seq";
    ## p_seq as int [0 32767] for $RANDOM range
    p_seq_int="$(bc -l <<< "scale=0; ($p_seq * 32767)/1")" || die "FATAL:bc failed:p_seq_int";
    if ! checkInt "$p_seq_int"; then die "FATAL:Not an int:p_seq_int:$p_seq_int"; fi;

    # generate output
    lco="$lc";   # save original input line count
    lcr="$lco";  # line count remaining
    lc_out="0";  # init output line counter
    if [[ -z "$lc_out_max" ]]; then lc_out_max="$lco"; fi;
    while [[ $lcr -ge 1 && $lc_out -lt $lc_out_max ]]; do
        ## Jump: start scanning for an unconsumed input line at a random index
        ip="$(shuf -i "0-$(( lco - 1 ))" -n 1)" || die "FATAL:shuf failed";
        consume_line;

        ## Consume the next sequential line with probability p_seq.
        while decide_read && [[ $lcr -ge 1 && $lc_out -lt $lc_out_max ]]; do
            consume_line;
        done;
    done;

}; # main program
181 | ||
182 | main "$@"; | |
183 | ||
184 | # Author: Steven Baltakatei Sandoval | |
185 | # License: GPLv3+ | |
186 | ||
187 | # Dependency information | |
188 | ||
189 | # bc 1.07.1 | |
190 | # Copyright 1991-1994, 1997, 1998, 2000, 2004, 2006, 2008, 2012-2017 Free Software Foundation, Inc. | |
191 | ||
192 | ||
193 | # shuf (GNU coreutils) 8.32 | |
194 | # Copyright (C) 2020 Free Software Foundation, Inc. | |
195 | # License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>. | |
196 | # This is free software: you are free to change and redistribute it. | |
197 | # There is NO WARRANTY, to the extent permitted by law. | |
198 | ||
199 | # Written by Paul Eggert. | |
200 | ||
201 |