Browse Source

Add labels to the arguments of regex functions
to cause warnings if they are applied without labels
(and thus possibly/likely in a wrong order)

Daniil Baturin 2 months ago
parent
commit
73807e8d93
5 changed files with 190 additions and 35 deletions
  1. 155 0
      link_widgets.ml
  2. 7 7
      src/link_widgets.ml
  3. 1 1
      src/path_options.ml
  4. 6 6
      src/plugin_api.ml
  5. 21 21
      src/regex_utils.ml

+ 155 - 0
link_widgets.ml

@@ -0,0 +1,155 @@
+(* Link target manipulation widgets *)
+
+module OH = Otoml.Helpers
+
+open Soupault_common
+
+(* By default, exclude three categories of links from target rewriting:
+     1. Links that have a URI schema (^([a-zA-Z0-9]+):), e.g. https://example.com
+     2. Links to anchors within the same page (^#), e.g. #my-anchor
+     3. Hand-made relative links (^\.), e.g. ../style.css
+     4. Protocol-relative URLs that begin with //
+ *)
+let default_exclude_regex = "^((([a-zA-Z0-9]+):)|#|\\.|//)"
+
+let link_selectors = ["a"; "link"; "img"; "script"; "audio"; "video"; "object"; "embed"]
+
+let get_target_attr elem =
+  let tag_name = Soup.name elem in
+  match tag_name with
+  | "a" -> "href"
+  | "link" -> "href"
+  | "img" -> "src"
+  | "script" -> "src"
+  | "audio" -> "src"
+  | "video" -> "src"
+  | "embed" -> "src"
+  | "object" -> "data"
+  | _ ->
+    (* Shouldn't happen *)
+    internal_error @@ Printf.sprintf
+      "a relative_links or an absolute_links widget tried to process unsupported element <%s>" tag_name
+
+let target_matches only_regex exclude_regex target =
+  match only_regex with
+  | Some r ->
+    if not (Regex_utils.Internal.matches target r)
+    then (let () = Logs.debug @@ fun m -> m "Link target \"%s\" does not match only_target_regex" target in false)
+    else true
+  | None ->
+    if (Regex_utils.Internal.matches target exclude_regex)
+    then (let () = Logs.debug @@ fun m -> m "Link target \"%s\" matches exclude_target_regex" target in false)
+    else true
+
+let relativize_link_target check_file env target =
+  let open Defaults in
+  (* Before doing any real work, check if the link target is pointing at a file that actually exists
+     at a path relative to _this_ page. If it does, the target is _already correct_
+     and doesn't need to be relativized.
+
+     If the target points at a path that doesn't exist, it _probably_ comes from a page template
+     at the top level, while the current page is deeper in the directory tree.
+
+     We are checking in the target rather than source dir for two reasons:
+       1. soupault copies static assets from the site_dir before processing page files,
+          so if an asset file exists in the site_dir, it's guaranteed to also be in the target_dir
+          when this code runs;
+       2. If a file is not in the site_dir, it doesn't mean it's not in the target_dir either.
+          It may be a dynamically generated asset created by a Lua plugin or an external script.
+   *)
+  if check_file && (Sys.file_exists (FilePath.concat env.target_dir target)) then target else
+  (* Remove the build_dir from the target path to produce a path relative to the site root. *)
+  let relative_target_dir = Regex_utils.Internal.replace env.target_dir ("^" ^ env.settings.build_dir) "" in
+  (* The assumption is that the target is valid for a page at the site root.
+     Thus, for pages in sub-directories, we need to add a "../" for every nesting level.
+   *)
+  let parent_path = Utils.split_path relative_target_dir |> List.map (fun _ -> "..") in
+  (* Strip leading slashes *)
+  let target = Regex_utils.Internal.replace target "^/+" "" in
+  (* Prepend generated double-dot path to the original target. *)
+  String.concat "/" (parent_path @ [target])
+
+let relativize elem env check_file only_regex exclude_regex =
+  let target_attr = get_target_attr elem in
+  let target = Soup.attribute target_attr elem in
+  match target with
+  | None ->
+    Logs.debug @@ fun m -> m "Ignoring a <%s> element without \"%s\" attribute" (Soup.name elem) target_attr
+  | Some target ->
+    if not (target_matches only_regex exclude_regex target)
+    then Logs.debug @@ fun m -> m "Link target \"%s\" is excluded by regex options, ignoring" target
+    else begin
+      let target = relativize_link_target check_file env target in
+      Soup.set_attribute (get_target_attr elem) target elem
+    end
+
+let absolutize elem env prefix check_file only_regex exclude_regex =
+  let open Defaults in
+  let target_attr = get_target_attr elem in
+  let target = Soup.attribute target_attr elem in
+  match target with
+  | None ->
+    Logs.debug @@ fun m -> m "Ignoring a <%s> element without \"%s\" attribute" (Soup.name elem) target_attr
+  | Some target ->
+    if not (target_matches only_regex exclude_regex target)
+    then Logs.debug @@ fun m -> m "Link target \"%s\" matches exclude_target_regex, ignoring" target
+    else begin
+      (* Remove the build_dir from the path *)
+      let relative_target_dir = Regex_utils.Internal.replace env.target_dir ("^" ^ env.settings.build_dir) "" in
+      (* Strip leading slashes *)
+      let target = Regex_utils.Internal.replace target "^/+" "" in
+      let parent_path =
+        (if check_file && (Sys.file_exists (FilePath.concat env.target_dir target))
+        then let dir_path = Utils.split_path relative_target_dir in String.concat "/" (prefix :: dir_path)
+        else prefix)
+      in
+      let target = String.concat "/" [parent_path; target] in
+      Soup.set_attribute (get_target_attr elem) target elem
+    end
+
+(** Converts all internal links to relative according to the page's location in the directory tree. *)
+let relative_links env config soup =
+  let valid_options = List.append Config.common_widget_options ["exclude_target_regex"; "only_target_regex"; "check_file"] in
+  let () = Config.check_options valid_options config "widget \"relative_links\"" in
+  let exclude_regex = OH.find_string_opt config ["exclude_target_regex"] in
+  let only_regex = OH.find_string_opt config ["only_target_regex"] in
+  if (Option.is_some exclude_regex) && (Option.is_some only_regex)
+  then Config.config_error "exclude_target_regex and only_target_regex options are mutually exclusive"
+  else begin
+    let exclude_regex = Option.value ~default:default_exclude_regex exclude_regex in
+    let check_file = Config.find_bool_or ~default:false config ["check_file"] in
+    let nodes = Html_utils.select_all link_selectors soup in begin
+    match nodes with
+    | [] ->
+      Logs.debug @@ fun m -> m "Page has no link elements that need adjustment"
+    | ns -> List.iter (fun e -> relativize e env check_file only_regex exclude_regex) ns
+    end;
+    Ok ()
+  end
+
+(** Converts all internal links to absolute. *)
+let absolute_links env config soup =
+  let (let*) = Result.bind in
+  let valid_options = List.append Config.common_widget_options
+    ["exclude_target_regex"; "only_target_regex"; "check_file"; "prefix"]
+  in
+  let () = Config.check_options valid_options config "widget \"absolute_links\"" in
+  let* prefix = Config.find_string_result config ["prefix"] in
+  (* Strip trailing slashes to avoid duplicate slashes after concatenation *)
+  let prefix = Regex_utils.Internal.replace prefix "/+$" "" in
+  let exclude_regex = OH.find_string_opt config ["exclude_target_regex"] in
+  let only_regex = OH.find_string_opt config ["only_target_regex"] in
+  if (Option.is_some exclude_regex) && (Option.is_some only_regex)
+  then Config.config_error "exclude_target_regex and only_target_regex options are mutually exclusive"
+  else begin
+    let exclude_regex = Option.value ~default:default_exclude_regex exclude_regex in
+    let check_file = Config.find_bool_or ~default:false config ["check_file"] in
+    let nodes = Html_utils.select_all link_selectors soup in begin
+    match nodes with
+    | [] ->
+      Logs.debug @@ fun m -> m "Page has no link elements that need adjustment"
+    | ns -> List.iter (fun e -> absolutize e env prefix check_file only_regex exclude_regex) ns
+    end;
+    Ok ()
+  end
+

+ 7 - 7
src/link_widgets.ml

@@ -33,11 +33,11 @@ let get_target_attr elem =
 let target_matches only_regex exclude_regex target =
   match only_regex with
   | Some r ->
-    if not (Regex_utils.Internal.matches target r)
+    if not (Regex_utils.Internal.matches ~regex:r target)
     then (let () = Logs.debug @@ fun m -> m "Link target \"%s\" does not match only_target_regex" target in false)
     else true
   | None ->
-    if (Regex_utils.Internal.matches target exclude_regex)
+    if (Regex_utils.Internal.matches ~regex:exclude_regex target)
     then (let () = Logs.debug @@ fun m -> m "Link target \"%s\" matches exclude_target_regex" target in false)
     else true
 
@@ -61,10 +61,10 @@ let relativize elem env check_file only_regex exclude_regex =
        *)
       if check_file && (Sys.file_exists (FilePath.concat env.target_dir target)) then () else
       (* Remove the build_dir from the path *)
-      let relative_target_dir = Regex_utils.Internal.replace env.target_dir ("^" ^ env.settings.build_dir) "" in
+      let relative_target_dir = Regex_utils.Internal.replace ~regex:("^" ^ env.settings.build_dir) ~sub:"" env.target_dir in
       let parent_path = Utils.split_path relative_target_dir |> List.map (fun _ -> "..") in
       (* Strip leading slashes *)
-      let target = Regex_utils.Internal.replace target "^/+" "" in
+      let target = Regex_utils.Internal.replace ~regex:"^/+" ~sub:"" target in
       let target = String.concat "/" (parent_path @ [target]) in
       Soup.set_attribute (get_target_attr elem) target elem
     end
@@ -81,9 +81,9 @@ let absolutize elem env prefix check_file only_regex exclude_regex =
     then Logs.debug @@ fun m -> m "Link target \"%s\" matches exclude_target_regex, ignoring" target
     else begin
       (* Remove the build_dir from the path *)
-      let relative_target_dir = Regex_utils.Internal.replace env.target_dir ("^" ^ env.settings.build_dir) "" in
+      let relative_target_dir = Regex_utils.Internal.replace ~regex:("^" ^ env.settings.build_dir) ~sub:"" env.target_dir in
       (* Strip leading slashes *)
-      let target = Regex_utils.Internal.replace target "^/+" "" in
+      let target = Regex_utils.Internal.replace ~regex:"^/+" ~sub:"" target in
       let parent_path =
         (if check_file && (Sys.file_exists (FilePath.concat env.target_dir target))
         then let dir_path = Utils.split_path relative_target_dir in String.concat "/" (prefix :: dir_path)
@@ -122,7 +122,7 @@ let absolute_links env config soup =
   let () = Config.check_options valid_options config "widget \"absolute_links\"" in
   let* prefix = Config.find_string_result config ["prefix"] in
   (* Strip trailing slashes to avoid duplicate slashes after concatenation *)
-  let prefix = Regex_utils.Internal.replace prefix "/+$" "" in
+  let prefix = Regex_utils.Internal.replace ~regex:"/+$" ~sub:"" prefix in
   let exclude_regex = OH.find_string_opt config ["exclude_target_regex"] in
   let only_regex = OH.find_string_opt config ["only_target_regex"] in
   if (Option.is_some exclude_regex) && (Option.is_some only_regex)

+ 1 - 1
src/path_options.ml

@@ -6,7 +6,7 @@ let page_matches site_dir actual_path conf_path =
   (=) conf_path actual_path
 
 let regex_matches actual_path path_re =
-  try Regex_utils.Raw.matches path_re actual_path
+  try Regex_utils.Raw.matches ~regex:path_re actual_path
   with Regex_utils.Bad_regex ->
     soupault_error @@ Printf.sprintf "Could not check a path regex option: malformed regex \"%s\"" path_re
 

+ 6 - 6
src/plugin_api.ml

@@ -13,31 +13,31 @@ let plugin_error err = raise (Plugin_error err)
 module Re_wrapper = struct
   let replace ?(all=false) s pat sub =
     try
-      Regex_utils.Raw.replace ~all:all pat s sub
+      Regex_utils.Raw.replace ~all:all ~regex:pat ~sub:sub s
     with Regex_utils.Bad_regex ->
       plugin_error @@ Printf.sprintf "Malformed regex \"%s\" in a Regex.replace call" pat
 
   let replace_all s pat sub =
     try
-      Regex_utils.Raw.replace ~all:true pat s sub
+      Regex_utils.Raw.replace ~all:true ~regex:pat ~sub:sub s
     with Regex_utils.Bad_regex ->
       plugin_error @@ Printf.sprintf "Malformed regex \"%s\" in a Regex.replace_all call" pat
 
   let find_all s pat =
     try
-      Regex_utils.Raw.get_matching_strings pat s
+      Regex_utils.Raw.get_matching_strings ~regex:pat s
     with Regex_utils.Bad_regex ->
       plugin_error @@ Printf.sprintf "Malformed regex \"%s\" in a Regex.find_all call" pat
 
   let re_match s pat =
     try
-      Regex_utils.Raw.matches pat s
+      Regex_utils.Raw.matches ~regex:pat s
     with Regex_utils.Bad_regex ->
       plugin_error @@ Printf.sprintf "Malformed regex \"%s\" in a Regex.match call" pat
 
   let split s pat =
     try
-      Regex_utils.Raw.split pat s
+      Regex_utils.Raw.split ~regex:pat s
     with Regex_utils.Bad_regex ->
       plugin_error @@ Printf.sprintf "Malformed regex \"%s\" in a Regex.split call" pat
 end
@@ -789,7 +789,7 @@ struct
          (fun s l -> try Text.sub s 0 l with Invalid_argument _ -> s);
        "truncate_ascii", V.efunc (V.string **-> V.int **->> V.string)
          (fun s l -> try String.sub s 0 l with Invalid_argument _ -> s);
-       "slugify_soft", V.efunc (V.string **->> V.string) (fun s -> Regex_utils.Internal.replace s "\\s+" "-");
+       "slugify_soft", V.efunc (V.string **->> V.string) (fun s -> Regex_utils.Internal.replace ~regex:"\\s+" ~sub:"-" s);
        "slugify_ascii", V.efunc (V.string **->> V.string) Utils.slugify;
        "join", V.efunc (V.string **-> V.list V.string **->> V.string) String.concat;
        "to_number", V.efunc (V.string **->> V.option V.float) (fun s -> try Some (float_of_string s) with _ -> None);

+ 21 - 21
src/regex_utils.ml

@@ -26,21 +26,21 @@ let compile_regex regex =
    (e.g. tell the user which config option bad regex comes from.
  *)
 module Raw = struct
-  let get_matching_strings regex str =
+  let get_matching_strings ~regex str =
     let re = compile_regex regex in
     Re.matches re str
 
-  let matches regex str =
-    let res = get_matching_strings regex str in
+  let matches ~regex str =
+    let res = get_matching_strings ~regex:regex str in
     match res with
     | [] -> false
     | _  -> true
 
-  let replace ?(all=false) regex str sub =
+  let replace ?(all=false) ~regex ~sub str =
     let re = compile_regex regex in
     Re.replace ~all:all ~f:(fun _ -> sub) re str
 
-  let split regex str =
+  let split ~regex str =
     let re = compile_regex regex in
     Re.split re str
 end
@@ -50,20 +50,20 @@ end
    That's why they raise the never-handled Internal_error exception.
  *)
 module Internal = struct
-  let get_matching_strings regex str =
-    try Raw.get_matching_strings regex str
+  let get_matching_strings ~regex str =
+    try Raw.get_matching_strings ~regex:regex str
     with Bad_regex -> internal_error @@ format_error regex
 
-  let matches regex string =
-    try Raw.matches regex string
+  let matches ~regex string =
+    try Raw.matches ~regex:regex string
     with Bad_regex -> internal_error @@ format_error regex
 
-  let replace ?(all=false) regex str sub =
-    try Raw.replace ~all:all regex str sub
+  let replace ?(all=false) ~regex ~sub str =
+    try Raw.replace ~all:all ~regex:regex ~sub:sub str
     with Bad_regex -> internal_error @@ format_error regex
 
-  let split regex string =
-    try Raw.split regex string
+  let split ~regex string =
+    try Raw.split ~regex:regex string
     with Bad_regex -> internal_error @@ format_error regex
 end
 
@@ -72,19 +72,19 @@ end
    and the user can choose whether to ignore page processing error or not.
  *)
 module Public = struct
-  let get_matching_strings regex str =
-    try Raw.get_matching_strings regex str
+  let get_matching_strings ~regex str =
+    try Raw.get_matching_strings ~regex:regex str
     with Bad_regex -> soupault_error @@ format_error regex
 
-  let matches regex string =
-    try Raw.matches regex string
+  let matches ~regex string =
+    try Raw.matches ~regex:regex string
     with Bad_regex -> soupault_error @@ format_error regex
 
-  let replace ?(all=false) regex str sub =
-    try Raw.replace ~all:all regex str sub
+  let replace ?(all=false) ~regex ~sub str =
+    try Raw.replace ~all:all ~regex:regex ~sub str
     with Bad_regex -> soupault_error @@ format_error regex
 
-  let split regex string =
-    try Raw.split regex string
+  let split ~regex string =
+    try Raw.split ~regex:regex string
     with Bad_regex -> soupault_error @@ format_error regex
 end