#!/bin/bash
# Desc: Convert wikicode to subpages
# Usage: mw_wc2sp.sh [path file]
# Input: arg1  path  input wikicode file
# Output: files  wikicode file tree
# Depends: Bash 5.1.16, GNU Coreutils 8.32
# Version: 0.2.0

# Regex identifying a subpage marker line; capture group 2 holds the
# subpage title (possibly followed by spaces).
re_sp='^(<!-- @subpage:)(.*)([ ]*-->)$';

# Default output directory.
d_out="./wikicode/";

# File names of the generated lists.
f_spl="subpage_list.txt";             # subpage title list
f_splv="subpage_list_validated.txt";  # subpage title list (validated)
f_splwc="subpages.wc";                # subpage list wikicode

# Paths of the generated lists beneath the output directory.
p_spl="${d_out}/subpages/${f_spl}";
p_splv="${d_out}/subpages/${f_splv}";
p_splwc="${d_out}/${f_splwc}";

yell() {
    # Desc: Print script path and all args to stderr
    # Input: args  str  message words (joined with spaces)
    # Output: stderr  str  "<script path>: <message>"
    printf '%s\n' "$0: $*" >&2;
};
die() {
    # Desc: Same as yell() but terminates the shell with exit status 111
    # Input: args  str  message words
    # Depends: yell()
    yell "$*";
    exit 111;
};
must() {
    # Desc: Run args as a command; report the args via die() if it fails
    # Input: args  command and its arguments
    # Depends: die()
    if ! "$@"; then
        die "cannot $*";
    fi;
};
get_path_fork_level() {
    # Desc: Get fork level from two paths, i.e. the number of leading
    #   path elements the two paths have in common.
    # Example: get_path_fork_level "a/b/c" "a/b/d"  # prints 2
    # Input: arg1  str  path
    #        arg2  str  path
    # Output: stdout     int  fork level
    #         stderr     str  error message (mixed absolute/relative paths)
    #         exit code  0 on success; 1 on mixed absolute/relative paths
    # Version: 0.0.2
    local path1="$1";
    local path2="$2";
    local dbl='//' sgl='/';
    local -a parts1 parts2;
    local i;
    local fork_level=0;

    # Squeeze multiple slashes and remove the trailing slash
    while [[ "$path1" == *//* ]]; do path1="${path1//$dbl/$sgl}"; done;
    while [[ "$path2" == *//* ]]; do path2="${path2//$dbl/$sgl}"; done;
    path1="${path1%/}";
    path2="${path2%/}";

    # Check for mixed absolute/relative paths. Nothing is written to
    # stdout on failure so a caller capturing the result never receives
    # diagnostics in place of an integer.
    if [[ "$path1" == /* ]] && [[ "$path2" == /* ]]; then
        # Remove initial /
        path1="${path1#/}";
        path2="${path2#/}";
    elif [[ "$path1" == /* ]] || [[ "$path2" == /* ]]; then
        echo "FATAL:Mixed relative and absolute paths not supported." 1>&2;
        return 1;
    fi;

    # Save paths as arrays with `/` as element delimiter. IFS is changed
    # for the `read` commands only.
    IFS='/' read -ra parts1 <<< "$path1";
    IFS='/' read -ra parts2 <<< "$path2";

    # Get fork level by counting identical path elements from rootside
    for (( i=0; i<${#parts1[@]} && i<${#parts2[@]}; i++ )); do
        if [[ "${parts1[i]}" != "${parts2[i]}" ]]; then break; fi;
        fork_level=$(( fork_level + 1 ));
    done;

    printf '%s\n' "$fork_level";
    return 0;
}; # Get fork level int from two paths
prune_path_rootside() {
    # Desc: Prunes a path from the root-side to a specified prune level,
    #   i.e. drops the first N `/`-delimited elements.
    # Example: prune_path_rootside "a/b/c" 1  # prints b/c
    # Input: arg1  str  path
    #        arg2  int  prune level (0-indexed; empty is treated as 0)
    # Output: stdout     str  pruned path (leading `/` kept only for an
    #                         absolute path pruned at level 0)
    #         stderr     str  error message (invalid prune level)
    #         exit code  0 on success; 1 if prune level is not a
    #                    non-negative integer
    # Version: 0.0.2
    local path="$1";
    local prune_level="${2:-0}";
    local is_abs=false;
    local -a parts;
    local i;
    local pruned_path="";

    # Refuse anything but digits; the value is used in arithmetic context.
    if [[ ! "$prune_level" =~ ^[0-9]+$ ]]; then
        echo "FATAL:Prune level not a non-negative integer:$prune_level" 1>&2;
        return 1;
    fi;

    # Check for absolute or relative path
    if [[ "$path" == /* ]]; then
        is_abs=true;
        # Remove initial /
        path="${path#/}";
    fi;

    # Save path as array with `/` as element delimiter. IFS is changed
    # for the `read` command only.
    IFS='/' read -ra parts <<< "$path";

    # Assemble pruned path from prune_level
    for (( i=prune_level; i<${#parts[@]}; i++ )); do
        pruned_path+="${parts[i]}/";
    done;

    # Trim trailing `/` delimiters
    while [[ "$pruned_path" == */ ]]; do
        pruned_path="${pruned_path%/}";
    done;

    # Restore initial / if appropriate
    if [[ "$is_abs" == "true" ]] && (( prune_level == 0 )); then
        pruned_path="/${pruned_path}";
    fi;

    # Output pruned path
    printf '%s\n' "$pruned_path";
    return 0;
}; # prune path rootside to int specified level
get_path_hierarchy_level() {
    # Desc: Outputs hierarchy level of input paths
    # Example: $ cat lines.txt | get_path_hierarchy_level
    # Input: stdin  str  lines with /-delimited paths
    # Output: stdout     int  hierarchy level of each path, one per line
    #                         (`a` is 0, `a/b` is 1; `/a` is 0, `/a/b` is 1;
    #                         an empty line yields -1)
    #         stderr     str  error message (mixed absolute/relative paths)
    #         exit code  0 on success; 1 on mixed absolute/relative paths
    #                    (nothing is printed to stdout in that case)
    # Version: 0.0.2

    local line level;
    local flag_root="";
    local dbl='//' sgl='/';
    local -a parts;
    local -a output=();
    local n=0;

    while read -r line || [[ -n "$line" ]]; do
        # The first line decides whether the list is absolute or relative.
        if (( n == 0 )); then
            if [[ "$line" == /* ]]; then
                flag_root=true;
            else
                flag_root=false;
            fi;
        fi;

        # Check for mixed absolute/relative paths.
        if { [[ "$flag_root" == "true" ]] && [[ "$line" != /* ]]; } || \
               { [[ "$flag_root" == "false" ]] && [[ "$line" == /* ]]; }; then
            echo "FATAL:Mixed relative and absolute paths not supported." 1>&2;
            return 1;
        fi;

        # Squeeze multiple slashes and remove the trailing slash
        while [[ "$line" == *//* ]]; do line="${line//$dbl/$sgl}"; done;
        line="${line%/}";

        # Count `/`-delimited fields; level is field count minus one
        # (an empty line has zero fields, hence level -1).
        IFS='/' read -ra parts <<< "$line";
        level=$(( ${#parts[@]} - 1 ));
        # The empty field before the leading `/` does not count.
        if [[ "$flag_root" == "true" ]]; then level=$(( level - 1 )); fi;

        # Append to output
        output+=("$level");
        n=$(( n + 1 ));
    done;

    # Print output (nothing at all for empty input)
    if (( ${#output[@]} > 0 )); then
        printf "%s\n" "${output[@]}";
    fi;
    return 0;
}; # return hierarchy level of lines as integers
validate_subpage_list() {
    # Desc: Check for illegal characters in subpage titles and escape
    #   `&`, `"` and `'` as HTML-style codes.
    # Input: stdin  unvalidated subpage list, one title per line
    # Output: stdout     validated subpage list (surrounding blanks
    #                    trimmed, special chars escaped)
    #         stderr     error messages
    #         exit code  1 if sed fails; the shell exits with 111 via
    #                    die() on an illegal title
    # Depends: yell(), die()
    #          GNU sed v4.8
    local raw line;
    local re_illegal='[][><|}{#_]'; # match illegal page names chars #, <, >, [, ], _, {, |, }
    local re_ts=' $'; # match trailing space

    # IFS= keeps trailing spaces intact so they can actually be detected.
    while IFS= read -r raw || [[ -n "$raw" ]]; do
        # Trim surrounding blanks the way a plain `read -r` does.
        read -r line <<< "$raw";

        # Reject chars illegal in Mediawiki page titles.
        if [[ "$line" =~ $re_illegal ]]; then
            die "FATAL:Illegal char. Not allowed: #, <, >, [, ], _, {, |, }:$line";
        fi;

        # Reject trailing spaces.
        if [[ "$raw" =~ $re_ts ]]; then
            die "FATAL:Trailing spaces not allowed:$raw";
        fi;

        # Replace some chars with HTML-style codes
        ## replace ampersand & with &amp;   # must be first, since the
        ##                                  # other codes contain `&`
        ## replace double quote " with &quot;
        ## replace single quote ' with &#39;
        line="$(sed \
                  -e 's/&/\&amp;/g' \
                  -e 's/"/\&quot;/g' \
                  -e "s/'/\&#39;/g" \
                  <<< "$line" )" || {
            echo "FATAL:Error running sed." 1>&2; return 1; };
        printf "%s\n" "$line";
    done;
    return 0;
};
check_input() {
    # Desc: Abort via die() unless arg1 is the path of an existing file
    # Input: arg1  str  path input file
    # Depends: die()
    local candidate="$1";
    [[ -f "$candidate" ]] || die "FATAL:Not a file path:$1";
}; # check input
assemble_subpage_ftree() {
    # Desc: Identify subpage markers in input wikicode file to create
    #   subpage list and subpage content files. Every input line
    #   (marker lines included) is appended verbatim to the content
    #   file of the most recently seen marker; lines preceding the
    #   first marker go to `presubpage.content`.
    # Input: var  fp_in  path input file
    #        var  re_sp  regex for identifying subpage markers
    #        var  d_out  path directory for output
    #        var  p_spl  path subpage list file
    # Output: files  ${d_out}/subpages/<title>.content
    #         file   ${p_spl} (one subpage title appended per marker)
    # Depends: yell(), die(), must()
    local line marker sp_path spc_path spc_dir;

    yell "STATUS:Running assemble_subpage_ftree()."; # debug

    # Create the base directory up front so that content preceding the
    # first marker has somewhere to go.
    must mkdir -p "${d_out}/subpages";
    spc_path="${d_out}/subpages/presubpage.content"; # default destination for content before subpage detected

    ## Process input line-by-line. `IFS=` preserves leading/trailing
    ## whitespace of content lines; the `||` clause keeps a final line
    ## lacking a newline.
    while IFS= read -r line || [[ -n "$line" ]]; do
        ### Check for subpage marker (surrounding blanks are ignored)
        read -r marker <<< "$line";
        if [[ "$marker" =~ $re_sp ]]; then
            #### Identify new subpage path, minus trailing spaces
            sp_path="${BASH_REMATCH[2]}";
            sp_path="${sp_path%"${sp_path##*[! ]}"}";
            #### Update subpage content file path
            spc_path="${d_out}/subpages/${sp_path}.content";
            spc_dir="$(dirname "$spc_path")";
            #### Prepare file destination
            if [[ ! -d "$spc_dir" ]]; then
                must mkdir -p "$spc_dir" && \
                    yell "STATUS:Created dir:${spc_dir}";
            fi;
            if [[ -f "$spc_path" ]]; then
                die "FATAL:File already exists:${spc_path}";
            fi;
            must touch "$spc_path";
            #### Append subpage path to subpage list
            printf "%s\n" "$sp_path" >> "$p_spl" || \
                die "FATAL:Cannot write subpage list:${p_spl}";
        fi;
        ### Write subpage content
        printf "%s\n" "$line" >> "$spc_path" || \
            die "FATAL:Cannot write subpage content:${spc_path}";
    done < "${fp_in}";

    yell "STATUS:Finished assemble_subpage_ftree()."; # debug
}; # process input wikicode into subpage content files and subpage list
print_wc_content() {
    # Desc: Print the subpage content file between an opening
    #   `----<onlyinclude>` line and a closing `</onlyinclude>----` line
    # Input: var  p_spc  path subpage content
    # Output: stdout
    printf '%s\n' '' '----<onlyinclude>';
    cat "$p_spc";
    printf '%s\n' '' '</onlyinclude>----';
}; # print wikicode content
print_wc_footer() {
    # Desc: Print the References, Footnotes and Comments sections that
    #   close a generated wikicode page
    # Output: stdout
    printf '%s\n' \
           '' '==References==' '<references />' \
           '' '==Footnotes==' '<references group=fn />' \
           '' '==Comments==' '<references group=cmt />' \
           '';
}; # print wikicode footer
_cow_relative_link() {
    # Desc: Build the relative wikilink target leading from one subpage
    #   to another (helper of create_output_wikicode()).
    # Example: _cow_relative_link "Part 1/Chapter 1" "Part 1/Chapter 2"
    #          # prints ../Chapter 2
    # Input: arg1  str  subpage path of the page holding the link
    #        arg2  str  subpage path of the link target
    # Output: stdout     str  relative link
    #         exit code  1 if a path helper fails
    # Depends: get_path_hierarchy_level(), get_path_fork_level(),
    #          prune_path_rootside()
    local from="$1";
    local to="$2";
    local hier fork relups j;
    local link="";

    hier="$(printf '%s\n' "$from" | get_path_hierarchy_level)" || return 1;
    fork="$(get_path_fork_level "$from" "$to")" || return 1;

    # Count relative ups needed (`../`) to climb from the current page
    # to the fork.
    relups=$(( hier - fork + 1 ));
    for (( j=0; j<relups; j++ )); do link+="../"; done;
    # Target lies beneath the current page: link as `/child`.
    if (( relups <= 0 )); then link+="/"; fi;

    # Append branch from fork to target
    link+="$(prune_path_rootside "$to" "$fork")" || return 1;
    printf '%s\n' "$link";
}; # relative link between two subpage paths
create_output_wikicode() {
    # Desc: Use subpage list and subpage content files to create
    #   output subpage wikicode. Each subpage file receives a line of
    #   navigation links (Next / Previous / Up, as applicable), the
    #   subpage content and a footer. An index of all subpages is
    #   appended to the subpage list wikicode file.
    # Input: var   p_spl     path subpage list file
    #        var   p_splv    path subpage list file (validated)
    #        file  ${p_spl}  subpage list file
    #        file  ${p_splv} subpage list file (validated)
    #        var   d_out     path directory for output
    #        var   p_splwc   path subpage list wikicode file
    # Depends: yell(), die(), must()
    #          get_path_fork_level()
    #          prune_path_rootside()
    #          get_path_hierarchy_level()
    #          print_wc_content() (reads var p_spc set here)
    #          print_wc_footer()
    # Output: files   subpages in $d_out
    #         stderr  status messages (nothing is written to stdout)
    local -a lines_spl=() lines_splv=();
    local lc_spl lc_splv k;
    local lprev lcurr lnext;
    local link_next link_prev;
    local p_spc p_spwc;

    yell "Running create_output_wikicode()."; # debug

    # Read subpage list files into arrays.
    if [[ -r "$p_spl" ]]; then
        mapfile -t lines_spl < "$p_spl";
    else
        yell "WARNING:Cannot read subpage list:$p_spl";
    fi;
    if [[ -r "$p_splv" ]]; then
        mapfile -t lines_splv < "$p_splv";
    else
        yell "WARNING:Cannot read subpage list:$p_splv";
    fi;

    # Check that subpage list files have same line counts
    lc_spl="${#lines_spl[@]}";
    lc_splv="${#lines_splv[@]}";
    if (( lc_spl != lc_splv )); then
        die "FATAL:Different line counts for subpage lists:$(declare -p lc_spl lc_splv;)";
    fi;

    # Initialize subpage list wikicode file
    must touch "$p_splwc";
    printf "==Stats==\n\n==Subpages==\n" >> "$p_splwc";

    # Read content files according to subpage list file
    for (( k=0; k<lc_splv; k++ )); do
        # Current title and its neighbours (empty at either end of list)
        lcurr="${lines_splv[k]}";
        lprev="";
        lnext="";
        if (( k > 0 )); then lprev="${lines_splv[k-1]}"; fi;
        if (( k + 1 < lc_splv )); then lnext="${lines_splv[k+1]}"; fi;

        # Check subpage content file (named after the unvalidated title)
        p_spc="${d_out}/subpages/${lines_spl[k]}.content";
        if [[ ! -f "$p_spc" ]]; then
            die "FATAL:Subpage content file missing:$p_spc";
        fi;

        # Prepare output subpage wikicode file
        p_spwc="${d_out}/subpages/${lcurr}.wc"; # use validated subpage name
        must touch "$p_spwc";
        ## Append subpage list wikicode file
        printf "* [[/%s]]\n" "$lcurr" >> "$p_splwc";

        # Build Next and Prev links only where a neighbour exists
        link_next="";
        link_prev="";
        if [[ -n "$lnext" ]]; then
            link_next="$(_cow_relative_link "$lcurr" "$lnext")" || \
                die "FATAL:Cannot build Next link for:$lcurr";
        fi;
        if [[ -n "$lprev" ]]; then
            link_prev="$(_cow_relative_link "$lcurr" "$lprev")" || \
                die "FATAL:Cannot build Previous link for:$lcurr";
        fi;

        {
            # Print navigation link wikicode
            if [[ -n "$lnext" ]] && [[ -n "$lprev" ]]; then
                printf "[[%s|Next]], [[%s|Previous]], [[../|Up]]\n" "$link_next" "$link_prev";
            elif [[ -n "$lnext" ]]; then
                printf "[[%s|Next]], [[../|Up]]\n" "$link_next";
            elif [[ -n "$lprev" ]]; then
                printf "[[%s|Previous]], [[../|Up]]\n" "$link_prev";
            else
                # Sole subpage: there is nothing to step to.
                printf "[[../|Up]]\n";
            fi;

            # Print subpage content
            print_wc_content;
            print_wc_footer;
        } >> "$p_spwc";
    done;

    # Add footer to subpage list wikicode file
    print_wc_footer >> "$p_splwc";

    yell "STATUS:Finished create_output_wikicode()."; # debug
}; # generate output subpage wikicode
main() {
    # Desc: Run the conversion: check the input path, split the input
    #   into subpage content files, validate the subpage titles, then
    #   generate the output wikicode.
    # Input: arg1  str  path input wikicode file
    #        var   p_spl   path subpage list file
    #        var   p_splv  path subpage list file (validated)
    # Output: var  fp_in  input file path (global)
    check_input "$@";
    declare -g fp_in="$1"; # input file path

    assemble_subpage_ftree;
    validate_subpage_list \
        < "$p_spl" \
        > "$p_splv";
    create_output_wikicode;
}; # main program

# Script entry point: hand all command-line arguments to main().
main "$@"

# Author: Steven Baltakatei Sandoval
# License: GPLv3+


# Example input:
# ```
# <!-- @subpage:Introduction -->
# This is an introduction.
# <!-- @subpage:Foreword -->
# This is a foreword.
#
# <!-- @subpage:Part 1/Chapter 1 -->
# Blah.
# <!-- @subpage:Part 1/Chapter 2 -->
# Blah.
# <!-- @subpage:Part 1/Chapter 2/Section A -->
# Blabbity blah.
# <!-- @subpage:Part 2/ -->
# Blah.
# <!-- @subpage:Part 2/Chapter 1 -->
# More blah.
# ```