#!/bin/bash
# Desc: Output random text selection from a text file within a directory
# Usage: randtxt.sh DIR
# Version 0.0.6
# Depends: Bash 5.1.16, GNU findutils 4.8.0, GNU Coreutils 8.32
# Example: randtxt.sh ~/Calibre\ Library/

# Tunables
#   SAMPLE   upper bound on the number of text files kept after shuffling
#   CONTEXT  width in bytes of the window placed around the random point;
#            getRandText prints CONTEXT/2 bytes on each side of that point
#            plus the point itself, i.e. up to CONTEXT+1 bytes
SAMPLE=10000;
CONTEXT=5000;
# Round a positive odd CONTEXT up to the next even number so that the two
# halves (integer division) add up to CONTEXT exactly.
if (( CONTEXT % 2 == 1 )); then CONTEXT=$(( CONTEXT + 1 )); fi;


# yell ARGS...: write "<script path>: <all args joined by spaces>" to stderr.
yell() {
    printf '%s: %s\n' "$0" "$*" >&2;
}
# die ARGS...: report ARGS through yell(), then terminate the shell with
# exit status 111.
die() {
    yell "$@";
    exit 111;
}
# must CMD [ARGS...]: run CMD with ARGS; when it fails, call die() with
# "cannot CMD ARGS...".
must() {
    if "$@"; then
        return 0;
    fi;
    die "cannot $*";
}
# showUsage: print the usage summary and an example invocation to stdout.
showUsage() {
    # One argument per output line; single quotes keep "~/" literal.
    printf '%s\n' \
        '    USAGE:' \
        '        randtxt.sh DIR' \
        '' \
        '    EXAMPLE:' \
        '      randtxt.sh ~/';
}
checkInput() {
    # Desc: Validate the script's positional arguments
    # input:  arg1  path that must name an existing directory
    # return: 0 when exactly one argument is given and it is a directory;
    #         otherwise die() is called with a FATAL message
    case $# in
        0) die "FATAL:Not enough arguments" ;;
        1) ;; # exactly one argument: fall through to the directory test
        *) die "FATAL:Too many arguments" ;;
    esac;
    [[ -d "$1" ]] || die "FATAL:Not a dir:$1";
    return 0;
};
getFileSizeList() {
    # Desc: Collect up to SAMPLE randomly chosen *.txt files found under a
    #       directory, add up their sizes and draw a random byte offset
    # input:  arg1   directory path
    #         var    SAMPLE        maximum number of files to keep
    # output: array  fileSizeList  one "SIZE<TAB>PATH" entry per file
    #         var    totalSize     sum of all SIZE fields
    #         var    randPoint     random integer within 0..totalSize-1
    # note:   calls die() when the combined size is zero

    local entry bytes;
    declare -g -a fileSizeList;
    declare -g totalSize=0;
    declare -g randPoint;

    # find emits NUL-terminated "SIZE<TAB>PATH" records; shuf -z shuffles
    # them and keeps at most SAMPLE; mapfile splits on NUL into the array.
    mapfile -d '' -t fileSizeList < <(
        find -- "$1" -type f -name "*.txt" -printf '%s\t%p\0' \
            | shuf -z -n"$SAMPLE";
    );

    for entry in "${fileSizeList[@]}"; do
        bytes="${entry%%$'\t'*}"; # text before the first tab is the size
        (( totalSize += bytes ));
    done;
    (( totalSize > 0 )) || die "FATAL:Total size is zero.";

    randPoint="$(shuf -n1 -i"0-$(( totalSize - 1 ))")";
};
getRandText() {
    # Desc: Print the bytes surrounding randPoint, where randPoint is read as
    #       an offset into the files of fileSizeList laid end to end
    # input:  array  fileSizeList  entries of the form "SIZE<TAB>PATH"
    #         var    randPoint     zero-based byte offset across all files
    #         var    CONTEXT       selection reaches CONTEXT/2 bytes to each
    #                              side of the point (clamped to the file)
    # output: stdout  the selected bytes followed by one newline
    #         stderr  "INFO:Sample of:PATH" naming the chosen file
    # return: 0 after printing a selection
    #         1 when no listed file contains randPoint

    local sum psum entry size fileStart filePoint fileEnd selStart selEnd selCount file;
    sum=0;  # bytes covered up to and including the current file
    psum=0; # bytes covered by all files before the current one
    for entry in "${fileSizeList[@]}"; do
        size="${entry%%$'\t'*}"; # text before the first tab
        sum=$(( sum + size ));

        # The point lies in this file once the running total passes it
        if [[ $sum -gt $randPoint ]]; then
            # Positions within the file (zero-indexed)
            fileStart=0;                              # first byte within file
            filePoint="$(( randPoint - psum ))";      # point as byte within file
            fileEnd="$(( size - 1 ))";                # last byte within file
            selStart="$(( filePoint - CONTEXT/2 ))";  # start of output selection
            selEnd="$(( filePoint + CONTEXT/2 ))";    # end of output selection
            # Clamp the selection to the file boundaries
            if [[ $selStart -lt $fileStart ]]; then selStart=$fileStart; fi;
            if [[ $selEnd -gt $fileEnd ]]; then selEnd=$fileEnd; fi;
            selCount="$(( selEnd - selStart + 1 ))";  # number of bytes within selection

            # Everything after the first tab is the path. Parameter expansion
            # keeps it intact; `cut` works line by line and would corrupt a
            # path that contains a newline followed by a tab.
            file="${entry#*$'\t'}";
            printf 'INFO:Sample of:%s\n' "$file" 1>&2;
            # tail counts from 1, hence selStart+1; head trims to the selection
            tail --bytes=+"$(( selStart + 1 ))" -- "$file" | head --bytes="$selCount";
            printf '\n';
            return 0;
        fi;
        psum=$sum; # store previous sum
    done;

    # Reached only when the list is empty or randPoint is past the last byte
    printf 'ERROR:Random point %s is outside the listed files\n' "$randPoint" 1>&2;
    return 1;
};
# Mark the worker functions for export to child processes.
# NOTE(review): nothing visible in this file invokes them from a child shell.
export -f checkInput;
export -f getFileSizeList;
export -f getRandText;

main() {
    # Desc: Validate the arguments, index the text files under the given
    #       directory, then print a random excerpt
    # Input:  arg1    directory path
    # Output: stdout  text
    local targetDir;

    checkInput "$@";   # receives every argument so it can count them
    targetDir="$1";
    getFileSizeList "$targetDir";
    getRandText;
    return 0;
};

# Entry point: run main with the script's arguments and exit 0 on success.
# When main returns non-zero, no exit is issued here, so the script ends
# with main's status as the status of this final command.
main "$@" && exit 0;
