defmodule Floki do alias Floki.{Finder, FilterOut, HTMLTree} require Logger @moduledoc """ Floki is a simple HTML parser that enables search for nodes using CSS selectors. ## Example Assuming that you have the following HTML: ```html

Floki

Github page philss
```

  To parse this, you can use the function `Floki.parse_document/1`:

  ```elixir
  {:ok, html} = Floki.parse_document(doc)
  # =>
  # [{"html", [],
  #   [
  #     {"body", [],
  #      [
  #        {"section", [{"id", "content"}],
  #         [
  #           {"p", [{"class", "headline"}], ["Floki"]},
  #           {"a", [{"href", "http://github.com/philss/floki"}], ["Github page"]},
  #           {"span", [{"data-model", "user"}], ["philss"]}
  #         ]}
  #      ]}
  #   ]}]
  ```

  With this document you can perform queries such as:

    * `Floki.find(html, "#content")`
    * `Floki.find(html, ".headline")`
    * `Floki.find(html, "a")`
    * `Floki.find(html, "[data-model=user]")`
    * `Floki.find(html, "#content a")`
    * `Floki.find(html, ".headline, a")`

  Each HTML node is represented by a tuple like:

      {tag_name, attributes, children_nodes}

  Example of node:

      {"p", [{"class", "headline"}], ["Floki"]}

  So even if the only child node is the element text, it is represented
  inside a list.
  """

  @type html_attribute :: {String.t(), String.t()}
  @type html_attributes :: [html_attribute()] | html_attributes_map()
  @type html_attributes_map :: %{String.t() => String.t()}
  @type html_declaration :: {:pi, String.t(), html_attributes()}
  @type html_comment :: {:comment, String.t()}
  @type html_doctype :: {:doctype, String.t(), String.t(), String.t()}
  @type html_text :: String.t()
  @type html_tag :: {String.t(), html_attributes(), [html_node()]}
  @type html_node ::
          html_tag() | html_comment() | html_doctype() | html_declaration() | html_text()
  @type html_tree :: [html_node()]

  @type css_selector :: String.t() | %Floki.Selector{} | [%Floki.Selector{}]

  @doc """
  Guard that checks whether `value` has the shape of an HTML node.

  It accepts binaries (text nodes), any 3-element tuple, 2-element tuples
  tagged with `:pi` or `:comment`, and 4-element tuples tagged with `:doctype`.
  Only the shape is checked; the contents of the tuple are not inspected.
  """
  defguard is_html_node(value)
           when is_binary(value) or tuple_size(value) == 3 or
                  (tuple_size(value) == 2 and elem(value, 0) in [:pi, :comment]) or
                  (tuple_size(value) == 4 and elem(value, 0) == :doctype)

  @doc """
  Parses a HTML Document from a String.

  The expected string is a valid HTML, but the parser will try to parse even
  with errors.

  When the parsed document has exactly one top-level node, that node is
  returned on its own; otherwise the whole list of nodes is returned.
  If the parser returns something other than `{:ok, document}`, that value
  is returned unchanged.
  """
  @spec parse(binary()) :: html_tag() | html_tree() | String.t() | {:error, String.t()}
  @deprecated "Use `parse_document/1` or `parse_fragment/1` instead."
  def parse(html) do
    with {:ok, document} <- Floki.HTMLParser.parse_document(html) do
      # Match on the list shape instead of walking it with length/1.
      case document do
        [single_node] -> single_node
        _ -> document
      end
    end
  end

  @doc """
  Parses an HTML document from a string.

  This is the main function to get a tree from an HTML string.

  ## Options

    * `:attributes_as_maps` - Change the behaviour of the parser to return the attributes
      as maps, instead of a list of `{"key", "value"}`. Default to `false`.

    * `:html_parser` - The module of the backend that is responsible for parsing
      the HTML string. By default it is set to the built-in parser, and the module
      name is equal to `Floki.HTMLParser.Mochiweb`, or from the value of the
      application env of the same name.

      See https://github.com/philss/floki#alternative-html-parsers for more details.

    * `:parser_args` - A list of options to the parser. This can be used to pass options
      that are specific for a given parser. Defaults to an empty list.

  ## Examples

      iex> Floki.parse_document("<html><head></head><body>hello</body></html>")
      {:ok, [{"html", [], [{"head", [], []}, {"body", [], ["hello"]}]}]}

      iex> Floki.parse_document("<html><head></head><body>hello</body></html>", html_parser: Floki.HTMLParser.Mochiweb)
      {:ok, [{"html", [], [{"head", [], []}, {"body", [], ["hello"]}]}]}

      iex> Floki.parse_document(
      ...>   "<html><head></head><body class=main>hello</body></html>",
      ...>   attributes_as_maps: true,
      ...>   html_parser: Floki.HTMLParser.Mochiweb
      ...>)
      {:ok, [{"html", %{}, [{"head", %{}, []}, {"body", %{"class" => "main"}, ["hello"]}]}]}

  """
  @spec parse_document(binary(), Keyword.t()) :: {:ok, html_tree()} | {:error, String.t()}
  defdelegate parse_document(document, opts \\ []), to: Floki.HTMLParser

  @doc """
  Parses a HTML Document from a string.

  Similar to `Floki.parse_document/1`, but raises `Floki.ParseError` if there was an error
  parsing the document.

## Example

      iex> Floki.parse_document!("<html><head></head><body>hello</body></html>")
      [{"html", [], [{"head", [], []}, {"body", [], ["hello"]}]}]

  """
  @spec parse_document!(binary(), Keyword.t()) :: html_tree()
  def parse_document!(document, opts \\ []) do
    # Unwrap the tagged tuple; a parser error becomes a Floki.ParseError.
    case parse_document(document, opts) do
      {:ok, parsed_document} -> parsed_document
      {:error, message} -> raise Floki.ParseError, message: message
    end
  end

  @doc """
  Parses an HTML fragment from a string.

  This is mostly for parsing sections of an HTML document.

  ## Options

    * `:attributes_as_maps` - Change the behaviour of the parser to return the attributes
      as maps, instead of a list of `{"key", "value"}`. Remember that maps are no longer
      ordered since OTP 26. Default to `false`.

    * `:html_parser` - The module of the backend that is responsible for parsing
      the HTML string. By default it is set to the built-in parser, and the module
      name is equal to `Floki.HTMLParser.Mochiweb`, or from the value of the
      application env of the same name.

      See https://github.com/philss/floki#alternative-html-parsers for more details.

    * `:parser_args` - A list of options to the parser. This can be used to pass options
      that are specific for a given parser. Defaults to an empty list.

  """
  @spec parse_fragment(binary(), Keyword.t()) :: {:ok, html_tree()} | {:error, String.t()}
  defdelegate parse_fragment(fragment, opts \\ []), to: Floki.HTMLParser

  @doc """
  Parses a HTML fragment from a string.

  Similar to `Floki.parse_fragment/1`, but raises `Floki.ParseError` if there was an error
  parsing the fragment.
  """
  @spec parse_fragment!(binary(), Keyword.t()) :: html_tree()
  def parse_fragment!(fragment, opts \\ []) do
    # Unwrap the tagged tuple; a parser error becomes a Floki.ParseError.
    case parse_fragment(fragment, opts) do
      {:ok, parsed_fragment} -> parsed_fragment
      {:error, message} -> raise Floki.ParseError, message: message
    end
  end

  @doc """
  Converts HTML tree to raw HTML.

  Note that the resultant HTML may be different from the original one.
  Spaces after tags and doctypes are ignored.

## Options

    * `:encode` - A boolean option to control if special HTML characters
      should be encoded as HTML entities. Defaults to `true`.

      You can also control the encoding behaviour at the application level via
      `config :floki, :encode_raw_html, false`

    * `:pretty` - Controls if the output should be formatted, ignoring
      breaklines and spaces from the input and putting new ones in order
      to pretty format the html. Defaults to `false`.

  ## Examples

      iex> Floki.raw_html({"div", [{"class", "wrapper"}], ["my content"]})
      ~s(<div class="wrapper">my content</div>)

      iex> Floki.raw_html({"div", [{"class", "wrapper"}], ["10 > 5"]})
      ~s(<div class="wrapper">10 &gt; 5</div>)

      iex> Floki.raw_html({"div", [{"class", "wrapper"}], ["10 > 5"]}, encode: false)
      ~s(<div class="wrapper">10 > 5</div>)

      iex> Floki.raw_html({"div", [], ["\\n ", {"span", [], "Fully indented"}, " \\n"]}, pretty: true)
      \"\"\"
      <div>
        <span>
          Fully indented
        </span>
      </div>
      \"\"\"
  """
  @spec raw_html(html_tree | binary, keyword) :: binary
  defdelegate raw_html(html_tree, options \\ []), to: Floki.RawHTML

  @doc """
  Find elements inside an HTML tree or string.

  Passing a binary is deprecated: it logs a deprecation notice and parses the
  string with `Floki.parse_document/1` before searching.

  ## Examples

      iex> {:ok, html} = Floki.parse_fragment("<p><span class=hint>hello</span></p>")
      iex> Floki.find(html, ".hint")
      [{"span", [{"class", "hint"}], ["hello"]}]

      iex> {:ok, html} = Floki.parse_fragment("<div id=important><div>Content</div></div>")
      iex> Floki.find(html, "#important")
      [{"div", [{"id", "important"}], [{"div", [], ["Content"]}]}]

      iex> {:ok, html} = Floki.parse_fragment("<p><a href='https://google.com'>Google</a></p>")
      iex> Floki.find(html, "a")
      [{"a", [{"href", "https://google.com"}], ["Google"]}]

      iex> Floki.find([{ "div", [], [{"a", [{"href", "https://google.com"}], ["Google"]}]}], "div a")
      [{"a", [{"href", "https://google.com"}], ["Google"]}]

  """
  @spec find(binary() | html_tree() | html_node(), css_selector()) :: html_tree
  def find(html, selector) when is_binary(html) do
    Logger.info(
      "deprecation: parse the HTML with parse_document or parse_fragment before using find/2"
    )

    # A parse failure is not matched here, so `with` returns it to the caller as-is.
    with {:ok, document} <- Floki.parse_document(html) do
      Finder.find(document, selector)
    end
  end

  def find(html_tree_as_tuple, selector)
      when is_list(html_tree_as_tuple) or is_html_node(html_tree_as_tuple) do
    Finder.find(html_tree_as_tuple, selector)
  end

  @doc """
  Finds the first element in an HTML tree by id.

  Returns `nil` if no element is found.

  This is useful when there are IDs that contain special characters that
  are invalid when passed as is as a CSS selector.
  It is similar to the `getElementById` method in the browser.

  ## Examples

      iex> {:ok, html} = Floki.parse_fragment(~s[

hello

])
      iex> Floki.get_by_id(html, "id?foo_special:chars")
      {"span", [{"class", "hint"}, {"id", "id?foo_special:chars"}], ["hello"]}
      iex> Floki.get_by_id(html, "does-not-exist")
      nil

  """
  @spec get_by_id(html_tree() | html_node(), String.t()) :: html_node() | nil
  def get_by_id(html_tree_as_tuple, id)
      when is_list(html_tree_as_tuple) or is_html_node(html_tree_as_tuple) do
    # Build the selector struct directly so the id never goes through CSS parsing.
    html_tree_as_tuple
    |> Finder.find(%Floki.Selector{id: id})
    |> List.first()
  end

  @doc """
  Changes the attribute values of the elements matched by `selector`
  with the function `mutation` and returns the whole element tree.

  When a matched element does not have the attribute yet, `mutation` is
  called with `nil` and the resulting attribute is added at the front of the
  element's attribute list.

  Passing a binary as the first argument is deprecated: it logs a deprecation
  notice and parses the string with `Floki.parse_document/1` first.

  ## Examples

      iex> Floki.attr([{"div", [{"id", "a"}], []}], "#a", "id", fn(id) -> String.replace(id, "a", "b") end)
      [{"div", [{"id", "b"}], []}]

      iex> Floki.attr([{"div", [{"class", "name"}], []}], "div", "id", fn _ -> "b" end)
      [{"div", [{"id", "b"}, {"class", "name"}], []}]

  """
  @spec attr(
          binary | html_tree | html_node,
          css_selector(),
          binary,
          (binary | nil -> binary)
        ) :: html_tree
  def attr(html_elem_tuple, selector, attribute_name, mutation) when is_tuple(html_elem_tuple) do
    attr([html_elem_tuple], selector, attribute_name, mutation)
  end

  def attr(html, selector, attribute_name, mutation) when is_binary(html) do
    Logger.info(
      "deprecation: parse the HTML with parse_document or parse_fragment before using attr/4"
    )

    with {:ok, document} <- Floki.parse_document(html) do
      attr(document, selector, attribute_name, mutation)
    end
  end

  def attr(html_tree_list, selector, attribute_name, mutation) when is_list(html_tree_list) do
    find_and_update(html_tree_list, selector, fn
      {tag, attrs} ->
        modified_attrs =
          if Enum.any?(attrs, &match?({^attribute_name, _}, &1)) do
            # Rewrite every occurrence of the attribute, leaving the others untouched.
            Enum.map(attrs, fn
              {^attribute_name, attribute_value} ->
                {attribute_name, mutation.(attribute_value)}

              other_attribute ->
                other_attribute
            end)
          else
            # Attribute is absent: let the mutation produce a value from nil.
            [{attribute_name, mutation.(nil)} | attrs]
          end

        {tag, modified_attrs}

      other ->
        other
    end)
  end

  @deprecated """
  Use `find_and_update/3` or `Enum.map/2` instead.
  """
  def map(_html_tree_or_list, _fun)

  def map(html_tree_list, fun) when is_list(html_tree_list) do
    Enum.map(html_tree_list, &Finder.map(&1, fun))
  end

  def map(html_tree, fun), do: Finder.map(html_tree, fun)

  @doc """
  Searches for elements inside the HTML tree and updates those that
  match the selector.

  It will return the updated HTML tree.

  This function works in a way similar to `traverse_and_update`, but instead of updating
  the children nodes, it will only update the `tag` and `attributes` of the matching nodes.

  If `fun` returns `:delete`, the HTML node will be removed from the tree.

  ## Examples

      iex> Floki.find_and_update([{"a", [{"href", "http://elixir-lang.com"}], ["Elixir"]}], "a", fn
      iex>   {"a", [{"href", href}]} ->
      iex>     {"a", [{"href", String.replace(href, "http://", "https://")}]}
      iex>   other ->
      iex>     other
      iex> end)
      [{"a", [{"href", "https://elixir-lang.com"}], ["Elixir"]}]
  """
  @spec find_and_update(
          html_tree(),
          css_selector(),
          ({String.t(), html_attributes()} -> {String.t(), html_attributes()} | :delete)
        ) :: html_tree()
  def find_and_update(html_tree, selector, fun) do
    tree = HTMLTree.build(html_tree)
    results = Finder.find(tree, selector)

    # Turn every match into a patch operation; `fun` only sees {tag, attributes}.
    operations_with_nodes =
      Enum.map(results, fn
        html_node = %Floki.HTMLTree.HTMLNode{} ->
          case fun.({html_node.type, html_node.attributes}) do
            {updated_tag, updated_attrs} ->
              {:update, %{html_node | type: updated_tag, attributes: updated_attrs}}

            :delete ->
              {:delete, html_node}
          end

        # Anything that is not an HTMLNode struct is passed through untouched.
        other ->
          {:no_op, other}
      end)

    tree
    |> HTMLTree.patch_nodes(operations_with_nodes)
    |> HTMLTree.to_tuple_list()
  end

  @doc """
  Traverses and updates a HTML tree structure.

  This function returns a new tree structure that is the result of applying the
  given `fun` on all nodes except text nodes.
  The tree is traversed in a post-walk fashion, where the children are traversed
  before the parent.

When the function `fun` encounters an HTML tag, it receives a tuple with
  `{name, attributes, children}`, and should either return a similar tuple,
  a list of tuples to split the current node or `nil` to delete it.

  The function `fun` can also encounter HTML doctype, comment or declaration and
  will receive, and should return, a different tuple for these types. See the
  documentation for `t:html_comment/0`, `t:html_doctype/0` and
  `t:html_declaration/0` for details.

  **Note**: this won't update text nodes, but you can transform them when working
  with children nodes.

  ## Examples

      iex> html = [{"div", [], ["hello"]}]
      iex> Floki.traverse_and_update(html, fn
      ...>   {"div", attrs, children} -> {"p", attrs, children}
      ...>   other -> other
      ...> end)
      [{"p", [], ["hello"]}]

      iex> html = [{"div", [], [{:comment, "I am comment"}, {"span", [], ["hello"]}]}]
      iex> Floki.traverse_and_update(html, fn
      ...>   {"span", _attrs, _children} -> nil
      ...>   {:comment, text} -> {"span", [], text}
      ...>   other -> other
      ...> end)
      [{"div", [], [{"span", [], "I am comment"}]}]
  """
  @spec traverse_and_update(
          html_node() | html_tree(),
          (html_node() -> html_node() | [html_node()] | nil)
        ) :: html_node() | html_tree()
  # The traversal itself lives in Floki.Traversal; this is only the public entry point.
  defdelegate traverse_and_update(html_tree, fun), to: Floki.Traversal

  @doc """
  Traverses and updates a HTML tree structure with an accumulator.

  This function returns a new tree structure and the final value of accumulator
  which are the result of applying the given `fun` on all nodes except text nodes.
  The tree is traversed in a post-walk fashion, where the children are traversed
  before the parent.

  When the function `fun` encounters an HTML tag, it receives a tuple with
  `{name, attributes, children}` and an accumulator. It should return a
  2-tuple like `{new_node, new_acc}`, where `new_node` is either a similar tuple
  or `nil` to delete the current node, and `new_acc` is an updated value for the
  accumulator.

The function `fun` can also encounter HTML doctype, comment or declaration and
  will receive, and should return, a different tuple for these types. See the
  documentation for `t:html_comment/0`, `t:html_doctype/0` and
  `t:html_declaration/0` for details.

  **Note**: this won't update text nodes, but you can transform them when working
  with children nodes.

  ## Examples

      iex> html = [{"div", [], [{:comment, "I am a comment"}, "hello"]}, {"div", [], ["world"]}]
      iex> Floki.traverse_and_update(html, 0, fn
      ...>   {"div", attrs, children}, acc ->
      ...>     {{"p", [{"data-count", to_string(acc)} | attrs], children}, acc + 1}
      ...>   other, acc -> {other, acc}
      ...> end)
      {[
         {"p", [{"data-count", "0"}], [{:comment, "I am a comment"}, "hello"]},
         {"p", [{"data-count", "1"}], ["world"]}
      ], 2}

      iex> html = {"div", [], [{"span", [], ["hello"]}]}
      iex> Floki.traverse_and_update(html, [deleted: 0], fn
      ...>   {"span", _attrs, _children}, acc ->
      ...>     {nil, Keyword.put(acc, :deleted, acc[:deleted] + 1)}
      ...>   tag, acc ->
      ...>     {tag, acc}
      ...> end)
      {{"div", [], []}, [deleted: 1]}
  """
  @spec traverse_and_update(
          html_node() | html_tree(),
          traverse_acc,
          (html_node(), traverse_acc ->
             {html_node() | [html_node()] | nil, traverse_acc})
        ) :: {html_node() | html_tree(), traverse_acc}
        when traverse_acc: any()
  # The traversal itself lives in Floki.Traversal; this is only the public entry point.
  defdelegate traverse_and_update(html_tree, acc, fun), to: Floki.Traversal

  @doc """
  Returns the text nodes from a HTML tree.

  By default, it will perform a deep search through the HTML tree.
  You can disable deep search with the option `deep` assigned to false.
  You can include content of script tags with the option `js` assigned to true.
  You can specify a separator between nodes content.

  ## Options

    * `:deep` - A boolean option to control how deep the search for
      text is going to be. If `false`, only the level of the HTML node
      or the first level of the HTML document is going to be considered.
      Defaults to `true`.

    * `:js` - A boolean option to control if the contents of script tags
      should be considered as text.
Defaults to `false`. * `:sep` - A separator string that is added between text nodes. Defaults to `""`. * `:include_inputs` - A boolean to control if `` or `