#!/usr/bin/env bash
# Desc: Mixes input lines while also preserving some neighbors
# Usage: cat file | bkshuf [arg1]
#        bkshuf [arg1] < file
# Version 0.0.1
# Depends: bc 1.07.1, GNU Coreutils 8.32 (shuf)
# Input: stdin: lines to mix; lines that are empty or contain only
#               spaces/tabs are skipped
#        var: arg1  maximum number of lines to output (optional
#                   non-negative integer; default: all input lines)
# Output: stdout: input lines in mixed order, one per line
#
# Provenance: file unitproc/bkshuf as introduced by git commit
#   22772fe83963783e1aa3686d277b8da051076e8e
#   From: Steven Baltakatei Sandoval
#   Date: Tue, 14 Feb 2023 11:45:28 +0000
#   Subject: [PATCH] feat(unitproc/bkshuf):Add bash script like shuf
#   - Note: bkshuf tends to preserve neighbors in output
#   (patch generated with git 2.30.2)

# Tuning parameters for the group size formula used in main():
#   s = ( (GSIZE - 1) / ln(LINEC)^2 ) * ln(lc)^2 + 1
# so an input of exactly BKSHUF_PARAM_LINEC lines gets a group size of
# BKSHUF_PARAM_GSIZE, and a single-line input gets a group size of 1.
BKSHUF_PARAM_LINEC=1000000;
BKSHUF_PARAM_GSIZE=25; # lines per group for BKSHUF_PARAM_LINEC lines of input


yell() { echo "$0: $*" >&2; }       # print script path and all args to stderr
die() { yell "$*"; exit 111; }      # same as yell() but exits with status 111
must() { "$@" || die "cannot $*"; } # runs args as command, reports args if command fails

read_stdin() {
    # Desc: Consumes stdin; outputs as stdout lines
    # Input: stdin (consumes); read whenever stdin is not a terminal,
    #        so both pipes and redirected files are accepted
    # Output: stdout (newline delimited); nothing if no line was read
    # Example: printf "foo\nbar\n" | read_stdin
    # Depends: GNU bash (version 5.1.16)
    # Version: 0.0.1
    local line;
    local -a output=();

    # Store stdin as array elements. `IFS=` keeps leading/trailing
    # whitespace intact; the `|| [[ -n $line ]]` clause keeps a final
    # line that lacks a terminating newline.
    if [[ ! -t 0 ]]; then
        while IFS= read -r line || [[ -n $line ]]; do
            output+=("$line");
        done;
    fi;

    # Print to stdout (skip entirely when empty so that no phantom
    # blank line is produced)
    if [[ ${#output[@]} -gt 0 ]]; then
        printf "%s\n" "${output[@]}";
    fi;
    return 0;
}; # read stdin to stdout lines

checkInt() {
    # Desc: Checks if arg is a non-negative integer (digits only)
    # Usage: checkInt arg
    # Input: arg: string to test
    # Output: - return code 0 (if arg is integer)
    #         - return code 1 (if arg is not integer)
    #         - exits via die() if not called with exactly one arg
    # Example: if ! checkInt $arg; then echo "not int"; fi;
    # Version: 0.0.1
    local re_uint='^[0-9]+$'; # Regular Expression to test

    #===Process Arg===
    if [[ $# -ne 1 ]]; then
        die "ERROR:Invalid number of arguments:$#";
    fi;

    #===Determine function return code===
    if [[ $1 =~ $re_uint ]]; then
        return 0;
    else
        return 1;
    fi;
}; # Checks if arg is integer

consume_line() {
    # Desc: Outputs and destroys line from list_input starting at index ip
    # Usage: consume_line;
    # Input: var: list_input  array  input lines (caller's scope)
    #        var: lco         int    line count original
    #        var: lcr         int    line count remaining
    #        var: ip          int    list_input index pointer
    # Output: stdout: a single non-blank element from list_input
    #         list_input one element destroyed
    #         var: ip      left at the index of the consumed element
    #         var: lc_out  incremented once
    #         var: lcr     decremented once
    #         return code 0 if a line was consumed; 1 otherwise
    local n line;

    if [[ $lco -lt 1 ]]; then return 1; fi; # nothing to scan

    n=0; # for tracking progress iterating through list_input
    ### Visit each of the lco indices at most once, starting at ip
    while [[ $n -lt $lco ]]; do
        #### check if line at ip is blank (i.e. already consumed)
        line="${list_input[$ip]}";
        if [[ -n $line ]]; then
            ##### consume line at index ip
            # Printed synchronously: a background job per line could
            # reorder the output and defeat neighbor preservation.
            printf "%s\n" "$line";   # print to output
            unset "list_input[$ip]"; # destroy line in list_input array
            ((lc_out++));
            ((lcr--));               # decrement line count remaining lcr
            #yell "DEBUG:Consumed line ip:$ip:$line";
            return 0;
        fi;
        #### increment input index pointer, wrapping from lco-1 to 0
        ip=$(( (ip + 1) % lco ));
        #### track progress through list_input
        ((n++));
    done;
    return 1;
}; # consume and output line in list_input array starting at index ip

decide_read() {
    # Desc: Decides whether to read another line in list_input array
    #       by comparing $RANDOM to p_seq_int
    # Usage: if decide_read; then something; fi;
    # Input: var: p_seq_int  probability (int [0 32767])
    # Output: return code 0 if $RANDOM is less than p_seq_int;
    #         return code 1 otherwise

    if [[ $RANDOM -lt $p_seq_int ]]; then
        return 0;
    else
        return 1;
    fi;
}; # returns 0 with probability p_seq; 1 otherwise

main() {
    # Desc: Reads lines from stdin and prints them in mixed order:
    #       jump to a random unconsumed line, then keep reading the
    #       following lines sequentially with probability p_seq.
    # Usage: main "$@"
    # Input: arg: $1  maximum number of lines to output (optional)
    #        stdin: input lines
    # Output: stdout: mixed lines
    # Note: the working variables below are local to main() but are
    #       read and modified by consume_line() and decide_read()
    #       through bash dynamic scoping.
    local par_l0 par_s0 s_exp s line;
    local lc lco lcr lc_out lc_out_max ip;
    local p_jump p_seq p_seq_int n_loop1;
    local -a list_input=();

    # Check positional arguments
    lc_out_max="";
    if [[ $# -gt 0 ]]; then
        if ! checkInt "$1"; then
            die "FATAL:Not an integer:$1";
        fi;
        # Force base 10 so that e.g. "08" is not parsed as invalid octal
        lc_out_max="$(( 10#$1 ))"; # output line count
    fi;

    # store input lines from stdin (like `shuf`)
    while IFS= read -r line; do
        # skip lines that are empty or consist only of spaces/tabs
        if [[ -z ${line//[[:blank:]]/} ]]; then continue; fi;
        #yell "DEBUG:INPUT:$line";
        list_input+=("$line");
    done < <( read_stdin; );

    # calc line count (lc)
    lc="${#list_input[@]}";
    #yell "DEBUG:lc:$lc";
    #yell "DEBUG:list_input:$(declare -p list_input)";

    # Nothing to mix: avoid evaluating l(0) in bc and an invalid
    # `shuf -i0--1` range.
    if [[ $lc -eq 0 ]]; then return 0; fi;

    # calculate group size s
    par_l0="$BKSHUF_PARAM_LINEC";
    par_s0="$BKSHUF_PARAM_GSIZE";
    s_exp="(( $par_s0 - 1 )/( ( l( $par_l0 ) )^2 ))*(l( $lc ))^2+1";
    s="$(echo "scale=12; $s_exp" | bc -l)";

    # calculate probabilities p_jump, p_seq
    p_jump="$(echo "scale=12; 1 / ( $s )" | bc -l)";
    p_seq="$(echo "scale=12; 1 - $p_jump" | bc -l)";
    p_seq_int="$(echo "scale=0; ($p_seq * 32767)/1" | bc -l)"; # p_seq as int [0 32767] for $RANDOM range

    # generate output
    lco="$lc";   # save original input line count
    lcr="$lco";  # line count remaining
    lc_out="0";  # init output line counter
    if [[ -z "$lc_out_max" ]]; then lc_out_max="$lco"; fi;
    n_loop1="0";
    while [[ $lcr -ge 1 ]] && [[ $lc_out -lt $lc_out_max ]]; do
        ## Select random input index and consume the first unconsumed
        ## line found at or after it. Abort if shuf fails: an empty ip
        ## would otherwise leave lcr unchanged and loop forever.
        ip="$(shuf -i "0-$(( lco - 1 ))" -n1)" \
            || die "FATAL:Could not select random line index with shuf";
        consume_line;

        ## Consume the next sequential line with probability p_seq.
        while decide_read && [[ $lcr -ge 1 ]] && [[ $lc_out -lt $lc_out_max ]]; do
            consume_line;
        done;

        ((n_loop1++)); # increment jump counter
    done;
    #yell "DEBUG:n_loop1:$n_loop1"; # count jumps
    return 0;
}; # main program

# Run main only when executed; `return` succeeds only in a sourced
# file, so sourcing this script just defines its functions.
if ! (return 0 2>/dev/null); then
    main "$@";
fi;

# Author: Steven Baltakatei Sandoval
# License: GPLv3+

# Dependency information

# bc 1.07.1
# Copyright 1991-1994, 1997, 1998, 2000, 2004, 2006, 2008, 2012-2017 Free Software Foundation, Inc.


# shuf (GNU coreutils) 8.32
# Copyright (C) 2020 Free Software Foundation, Inc.
# License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>.
# This is free software: you are free to change and redistribute it.
# There is NO WARRANTY, to the extent permitted by law.

# Written by Paul Eggert.