Floki
Github page philssdefmodule Floki do alias Floki.{Finder, FilterOut, HTMLTree} require Logger @moduledoc """ Floki is a simple HTML parser that enables search for nodes using CSS selectors. ## Example Assuming that you have the following HTML: ```html
Floki
Github page philsshello
") iex> Floki.find(html, ".hint") [{"span", [{"class", "hint"}], ["hello"]}] iex> {:ok, html} = Floki.parse_fragment("hello
])
      iex> Floki.get_by_id(html, "id?foo_special:chars")
      {"span", [{"class", "hint"}, {"id", "id?foo_special:chars"}], ["hello"]}
      iex> Floki.get_by_id(html, "does-not-exist")
      nil

  """
  # `List.first/1` yields `nil` when nothing matches, so the result is a single
  # node or `nil` (see the "does-not-exist" example), never a whole tree.
  @spec get_by_id(html_tree() | html_node(), String.t()) :: html_node() | nil
  def get_by_id(html_tree_as_tuple, id)
      when is_list(html_tree_as_tuple) or is_html_node(html_tree_as_tuple) do
    html_tree_as_tuple
    |> Finder.find(%Floki.Selector{id: id})
    |> List.first()
  end

  @doc """
  Changes the attribute values of the elements matched by `selector`
  with the function `mutation` and returns the whole element tree.

  When a matched element does not have `attribute_name` yet, `mutation` is
  called with `nil` and its result is added as a new attribute at the front
  of the element's attribute list.

  A single HTML node is wrapped in a list before being processed. Passing a
  binary is deprecated: a deprecation message is logged and the input is
  parsed with `parse_document/1` first.

  ## Examples

      iex> Floki.attr([{"div", [{"id", "a"}], []}], "#a", "id", fn(id) -> String.replace(id, "a", "b") end)
      [{"div", [{"id", "b"}], []}]

      iex> Floki.attr([{"div", [{"class", "name"}], []}], "div", "id", fn _ -> "b" end)
      [{"div", [{"id", "b"}, {"class", "name"}], []}]

  """
  # `mutation` receives `nil` when the attribute is missing, hence `binary | nil`.
  @spec attr(
          binary | html_tree | html_node,
          css_selector(),
          binary,
          (binary | nil -> binary)
        ) :: html_tree
  def attr(html_elem_tuple, selector, attribute_name, mutation) when is_tuple(html_elem_tuple) do
    attr([html_elem_tuple], selector, attribute_name, mutation)
  end

  def attr(html, selector, attribute_name, mutation) when is_binary(html) do
    Logger.info(
      "deprecation: parse the HTML with parse_document or parse_fragment before using attr/4"
    )

    # A result of `parse_document/1` that is not `{:ok, document}` falls through
    # the `with` and is returned to the caller unchanged.
    with {:ok, document} <- Floki.parse_document(html) do
      attr(document, selector, attribute_name, mutation)
    end
  end

  def attr(html_tree_list, selector, attribute_name, mutation) when is_list(html_tree_list) do
    find_and_update(html_tree_list, selector, fn
      {tag, attrs} ->
        modified_attrs =
          if Enum.any?(attrs, &match?({^attribute_name, _}, &1)) do
            # Rewrite every occurrence of the attribute; keep all others as they are.
            Enum.map(attrs, fn
              {^attribute_name, attribute_value} ->
                {attribute_name, mutation.(attribute_value)}

              other_attribute ->
                other_attribute
            end)
          else
            # Attribute is absent: let `mutation` build its value from `nil`.
            [{attribute_name, mutation.(nil)} | attrs]
          end

        {tag, modified_attrs}

      other ->
        other
    end)
  end

  @deprecated """
  Use `find_and_update/3` or `Enum.map/2` instead.
  """
  def map(_html_tree_or_list, _fun)

  def map(html_tree_list, fun) when is_list(html_tree_list) do
    Enum.map(html_tree_list, &Finder.map(&1, fun))
  end

  def map(html_tree, fun), do: Finder.map(html_tree, fun)

  @doc """
  Searches for elements inside the HTML tree and updates those that match the selector.

  It will return the updated HTML tree.

  This function works in a way similar to `traverse_and_update`, but instead of updating
  the children nodes, it only updates the `tag` and `attributes` of the matching nodes.

  If `fun` returns `:delete`, the HTML node will be removed from the tree.

  ## Examples

      iex> Floki.find_and_update([{"a", [{"href", "http://elixir-lang.com"}], ["Elixir"]}], "a", fn
      iex>   {"a", [{"href", href}]} ->
      iex>     {"a", [{"href", String.replace(href, "http://", "https://")}]}
      iex>   other ->
      iex>     other
      iex> end)
      [{"a", [{"href", "https://elixir-lang.com"}], ["Elixir"]}]

  """
  @spec find_and_update(
          html_tree(),
          css_selector(),
          ({String.t(), html_attributes()} -> {String.t(), html_attributes()} | :delete)
        ) :: html_tree()
  def find_and_update(html_tree, selector, fun) do
    tree = HTMLTree.build(html_tree)
    results = Finder.find(tree, selector)

    # Turn every match into a patch operation: HTML nodes are updated or deleted
    # according to what `fun` returns; any other kind of match is left alone.
    operations_with_nodes =
      Enum.map(results, fn
        html_node = %Floki.HTMLTree.HTMLNode{} ->
          case fun.({html_node.type, html_node.attributes}) do
            {updated_tag, updated_attrs} ->
              {:update, %{html_node | type: updated_tag, attributes: updated_attrs}}

            :delete ->
              {:delete, html_node}
          end

        other ->
          {:no_op, other}
      end)

    tree
    |> HTMLTree.patch_nodes(operations_with_nodes)
    |> HTMLTree.to_tuple_list()
  end

  @doc """
  Traverses and updates a HTML tree structure.

  This function returns a new tree structure that is the result of applying the
  given `fun` on all nodes except text nodes.
  The tree is traversed in a post-walk fashion, where the children are traversed
  before the parent.

When the function `fun` encounters an HTML tag, it receives a tuple with
  `{name, attributes, children}`, and should either return a similar tuple,
  a list of tuples to split the current node, or `nil` to delete it.

  The function `fun` can also encounter an HTML doctype, comment or declaration and
  will receive, and should return, a different tuple for these types. See the
  documentation for `t:html_comment/0`, `t:html_doctype/0` and
  `t:html_declaration/0` for details.

  **Note**: this won't update text nodes, but you can transform them when working
  with children nodes.

  ## Examples

      iex> html = [{"div", [], ["hello"]}]
      iex> Floki.traverse_and_update(html, fn
      ...>   {"div", attrs, children} -> {"p", attrs, children}
      ...>   other -> other
      ...> end)
      [{"p", [], ["hello"]}]

      iex> html = [{"div", [], [{:comment, "I am comment"}, {"span", [], ["hello"]}]}]
      iex> Floki.traverse_and_update(html, fn
      ...>   {"span", _attrs, _children} -> nil
      ...>   {:comment, text} -> {"span", [], text}
      ...>   other -> other
      ...> end)
      [{"div", [], [{"span", [], "I am comment"}]}]

  """
  @spec traverse_and_update(
          html_node() | html_tree(),
          (html_node() -> html_node() | [html_node()] | nil)
        ) :: html_node() | html_tree()

  # Thin public entry point: the call is forwarded to `Floki.Traversal`.
  defdelegate traverse_and_update(html_tree, fun), to: Floki.Traversal

  @doc """
  Traverses and updates a HTML tree structure with an accumulator.

  This function returns a new tree structure and the final value of the accumulator,
  which are the result of applying the given `fun` on all nodes except text nodes.
  The tree is traversed in a post-walk fashion, where the children are traversed
  before the parent.

  When the function `fun` encounters an HTML tag, it receives a tuple with
  `{name, attributes, children}` and an accumulator. It should return a
  2-tuple like `{new_node, new_acc}`, where `new_node` is either a similar tuple
  or `nil` to delete the current node, and `new_acc` is an updated value for the
  accumulator.

The function `fun` can also encounter an HTML doctype, comment or declaration and
  will receive, and should return, a different tuple for these types. See the
  documentation for `t:html_comment/0`, `t:html_doctype/0` and
  `t:html_declaration/0` for details.

  **Note**: this won't update text nodes, but you can transform them when working
  with children nodes.

  ## Examples

      iex> html = [{"div", [], [{:comment, "I am a comment"}, "hello"]}, {"div", [], ["world"]}]
      iex> Floki.traverse_and_update(html, 0, fn
      ...>   {"div", attrs, children}, acc ->
      ...>     {{"p", [{"data-count", to_string(acc)} | attrs], children}, acc + 1}
      ...>   other, acc -> {other, acc}
      ...> end)
      {[
         {"p", [{"data-count", "0"}], [{:comment, "I am a comment"}, "hello"]},
         {"p", [{"data-count", "1"}], ["world"]}
      ], 2}

      iex> html = {"div", [], [{"span", [], ["hello"]}]}
      iex> Floki.traverse_and_update(html, [deleted: 0], fn
      ...>   {"span", _attrs, _children}, acc ->
      ...>     {nil, Keyword.put(acc, :deleted, acc[:deleted] + 1)}
      ...>   tag, acc ->
      ...>     {tag, acc}
      ...> end)
      {{"div", [], []}, [deleted: 1]}

  """
  @spec traverse_and_update(
          html_node() | html_tree(),
          traverse_acc,
          (html_node(), traverse_acc ->
             {html_node() | [html_node()] | nil, traverse_acc})
        ) :: {html_node() | html_tree(), traverse_acc}
        when traverse_acc: any()

  # Thin public entry point: the call is forwarded to `Floki.Traversal`.
  defdelegate traverse_and_update(html_tree, acc, fun), to: Floki.Traversal

  @doc """
  Returns the text nodes from a HTML tree.

  By default, it will perform a deep search through the HTML tree.
  You can disable deep search with the option `deep` assigned to false.
  You can include content of script tags with the option `js` assigned to true.
  You can specify a separator between nodes content.

  ## Options

    * `:deep` - A boolean option to control how deep the search for
      text is going to be. If `false`, only the level of the HTML node
      or the first level of the HTML document is going to be considered.
      Defaults to `true`.

    * `:js` - A boolean option to control if the contents of script tags
      should be considered as text.
Defaults to `false`. * `:sep` - A separator string that is added between text nodes. Defaults to `""`. * `:include_inputs` - A boolean to control if `` or `