<?php
/**
 * Class HTML_To_Markdown
 *
 * A helper class to convert HTML to Markdown.
 *
 * @version 2.2.1
 * @author Nick Cernis <nick@cern.is>
 * @link https://github.com/nickcernis/html2markdown/ Latest version on GitHub.
 * @link http://twitter.com/nickcernis Nick on twitter.
 * @license http://www.opensource.org/licenses/mit-license.php MIT
 */
class HTML_To_Markdown
{
    /**
     * @var DOMDocument The root of the document tree that holds our HTML.
     */
    private $document;

    /**
     * @var string|boolean The Markdown version of the original HTML, or false if conversion failed
     */
    private $output;

    /**
     * @var array Class-wide options users can override.
     */
    private $options = array(
        'header_style'    => 'setext', // Set to "atx" to output H1 and H2 headers as # Header1 and ## Header2
        'suppress_errors' => true,     // Set to false to show warnings when loading malformed HTML
        'strip_tags'      => false,    // Set to true to strip tags that don't have markdown equivalents. N.B. Strips tags, not their content. Useful to clean MS Word HTML output.
        'bold_style'      => '**',     // Set to '__' if you prefer the underlined style
        'italic_style'    => '*',      // Set to '_' if you prefer the underlined style
        'remove_nodes'    => '',       // space-separated list of dom nodes that should be removed. example: "meta style script"
    );

    /**
     * Constructor
     *
     * Merges any option overrides into the defaults and, when HTML is supplied,
     * converts it immediately and stores the result in $this->output.
     *
     * @param string $html      [optional] The HTML to convert to Markdown.
     * @param array  $overrides [optional] List of style and error display overrides.
     */
    public function __construct($html = null, $overrides = null)
    {
        if ($overrides)
            $this->options = array_merge($this->options, $overrides);

        if ($html)
            $this->convert($html);
    }

    /**
     * Setter for conversion options
     *
     * @param string $name  Option key, e.g. 'header_style'.
     * @param mixed  $value New value for the option.
     */
    public function set_option($name, $value)
    {
        $this->options[$name] = $value;
    }

    /**
     * Convert
     *
     * Loads the HTML into a fresh DOMDocument and passes it to get_markdown().
     *
     * @param string $html
     * @return string|boolean The Markdown version of the html, or false if conversion failed
     */
    public function convert($html)
    {
        // Strip white space between tags to prevent creation of empty #text nodes
        $html = preg_replace('~>\s+<~', '><', $html);

        $this->document = new DOMDocument();

        $suppress = (bool) $this->options['suppress_errors'];
        $previous_setting = false;

        if ($suppress)
            $previous_setting = libxml_use_internal_errors(true); // Suppress conversion errors (from http://bit.ly/pCCRSX )

        $this->document->loadHTML('<?xml encoding="UTF-8">' . $html); // Hack to load utf-8 HTML (from http://bit.ly/pVDyCt )
        $this->document->encoding = 'UTF-8';

        if ($suppress) {
            libxml_clear_errors();
            // libxml error handling is process-global: hand the caller's setting back
            // instead of leaving internal error collection switched on.
            libxml_use_internal_errors($previous_setting);
        }

        return $this->get_markdown();
    }

    /**
     * Is Child Of?
     *
     * Is the node a descendant of an element with one of the given tag names?
     *
     * @param string|array $parent_name The name of the parent node(s) to search for e.g. 'code' or array('pre', 'code')
     * @param DOMNode      $node
     * @return bool
     */
    private static function is_child_of($parent_name, $node)
    {
        $parent_names = (array) $parent_name;

        // Walk up the ancestor chain until we run out of parents
        for ($p = $node->parentNode; $p !== null; $p = $p->parentNode) {
            if (in_array($p->nodeName, $parent_names, true))
                return true;
        }

        return false;
    }

    /**
     * Convert Children
     *
     * Recursive function to drill into the DOM and convert each node into Markdown from the inside out.
     *
     * Finds children of each node and converts those to #text nodes containing their Markdown equivalent,
     * starting with the innermost element and working up to the outermost element.
     *
     * @param DOMNode $node
     */
    private function convert_children($node)
    {
        // Don't convert HTML code inside <code> and <pre> blocks to Markdown - that should stay as HTML
        if (self::is_child_of(array('pre', 'code'), $node))
            return;

        // If the node has children, convert those to Markdown first.
        // Each child is replaced in place by exactly one text node, so indexes stay stable.
        if ($node->hasChildNodes()) {
            $length = $node->childNodes->length;

            for ($i = 0; $i < $length; $i++) {
                $child = $node->childNodes->item($i);
                $this->convert_children($child);
            }
        }

        // Now that child nodes have been converted, convert the original node
        $markdown = $this->convert_to_markdown($node);

        // Create a DOM text node containing the Markdown equivalent of the original node
        $markdown_node = $this->document->createTextNode($markdown);

        // Replace the old $node e.g. "<h3>Title</h3>" with the new $markdown_node e.g. "### Title"
        $node->parentNode->replaceChild($markdown_node, $node);
    }

    /**
     * Get Markdown
     *
     * Sends the html node to convert_children() to change inner nodes to Markdown #text nodes, then saves and
     * returns the resulting converted document as a string in Markdown format.
     *
     * @return string|boolean The converted HTML as Markdown, or false if conversion failed
     */
    private function get_markdown()
    {
        // Work on the entire DOM tree (including head and body)
        $input = $this->document->getElementsByTagName("html")->item(0);

        if (!$input) {
            // Do not leave the result of an earlier conversion behind
            $this->output = false;
            return false;
        }

        // Convert all children of this root element. The DOMDocument stored in $this->document will
        // then consist of #text nodes, each containing a Markdown version of the original node
        // that it replaced.
        $this->convert_children($input);

        // Sanitize and return the body contents as a string.
        $markdown = $this->document->saveHTML(); // stores the DOMDocument as a string
        $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8');
        $markdown = html_entity_decode($markdown, ENT_QUOTES, 'UTF-8'); // Double decode to cover cases like &amp;nbsp; http://www.php.net/manual/en/function.htmlentities.php#99984
        $markdown = preg_replace("/<!DOCTYPE [^>]+>/", "", $markdown); // Strip doctype declaration
        $unwanted = array('<html>', '</html>', '<body>', '</body>', '<head>', '</head>', '<?xml encoding="UTF-8">', '&#xD;');
        $markdown = str_replace($unwanted, '', $markdown); // Strip unwanted tags
        $markdown = trim($markdown, "\n\r\0\x0B"); // Deliberately leaves spaces and tabs alone

        $this->output = $markdown;

        return $markdown;
    }

    /**
     * Convert to Markdown
     *
     * Converts an individual node into a string of its Markdown equivalent.
     *
     * Example: An <h3> node with text content of "Title" becomes "### Title"
     *
     * @param DOMNode $node
     * @return string The converted HTML as Markdown
     */
    private function convert_to_markdown($node)
    {
        $tag = $node->nodeName;     // the type of element, e.g. h1
        $value = $node->nodeValue;  // the value of that element, e.g. The Title

        // Strip nodes named in remove_nodes
        $tags_to_remove = explode(' ', $this->options['remove_nodes']);
        if (in_array($tag, $tags_to_remove, true))
            return '';

        switch ($tag) {
            case "p":
                // Compare against '' so that a paragraph containing just "0" is kept
                $markdown = (trim($value) !== '') ? rtrim($value) . PHP_EOL . PHP_EOL : '';
                break;
            case "pre":
                $markdown = PHP_EOL . $this->convert_code($node) . PHP_EOL;
                break;
            case "h1":
            case "h2":
                $markdown = $this->convert_header($tag, $node);
                break;
            case "h3":
            case "h4":
            case "h5":
            case "h6":
                // "h3" => "### ", "h4" => "#### ", and so on
                $markdown = str_repeat('#', (int) substr($tag, 1)) . ' ' . $value . PHP_EOL . PHP_EOL;
                break;
            case "em":
            case "i":
            case "strong":
            case "b":
                $markdown = $this->convert_emphasis($tag, $value);
                break;
            case "hr":
                $markdown = "- - - - - -" . PHP_EOL . PHP_EOL;
                break;
            case "br":
                // Two trailing spaces are Markdown's hard line break
                $markdown = "  " . PHP_EOL;
                break;
            case "blockquote":
                $markdown = $this->convert_blockquote($node);
                break;
            case "code":
                $markdown = $this->convert_code($node);
                break;
            case "ol":
            case "ul":
                $markdown = $value . PHP_EOL;
                break;
            case "li":
                $markdown = $this->convert_list($node);
                break;
            case "img":
                $markdown = $this->convert_image($node);
                break;
            case "a":
                $markdown = $this->convert_anchor($node);
                break;
            case "#text":
                $markdown = preg_replace('~\s+~', ' ', $value);
                // Escape a leading "#" so plain text is not read as a header
                $markdown = preg_replace('~^#~', '\\\\#', $markdown);
                break;
            case "#comment":
                $markdown = '';
                break;
            case "div":
                $markdown = ($this->options['strip_tags']) ? $value . PHP_EOL . PHP_EOL : html_entity_decode($node->C14N());
                break;
            default:
                // If strip_tags is false (the default), preserve tags that don't have Markdown equivalents,
                // such as <span> nodes on their own. C14N() canonicalizes the node to a string.
                // See: http://www.php.net/manual/en/domnode.c14n.php
                $markdown = ($this->options['strip_tags']) ? $value : html_entity_decode($node->C14N());
        }

        return $markdown;
    }

    /**
     * Convert Header
     *
     * Converts h1 and h2 headers to Markdown-style headers in setext style,
     * matching the length of the underline with the length of the title.
     *
     * e.g.     Header 1    Header Two
     *          ========    ----------
     *
     * Returns atx headers instead if $this->options['header_style'] is "atx",
     * or if the header sits inside a blockquote.
     *
     * e.g.    # Header 1   ## Header Two
     *
     * @param string  $level The header level, including the "h". e.g. h1
     * @param DOMNode $node  The node to convert.
     * @return string The Markdown version of the header.
     */
    private function convert_header($level, $node)
    {
        $content = $node->nodeValue;

        if (!self::is_child_of('blockquote', $node) && $this->options['header_style'] == "setext") {
            $length = (function_exists('mb_strlen')) ? mb_strlen($content, 'utf-8') : strlen($content);
            $underline = ($level == "h1") ? "=" : "-";
            $markdown = $content . PHP_EOL . str_repeat($underline, $length) . PHP_EOL . PHP_EOL; // setext style
        } else {
            $prefix = ($level == "h1") ? "# " : "## ";
            $markdown = $prefix . $content . PHP_EOL . PHP_EOL; // atx style
        }

        return $markdown;
    }

    /**
     * Converts inline styles
     * This function is used to render strong and em tags
     *
     * eg <strong>bold text</strong> becomes **bold text** or __bold text__
     *
     * @param string $tag
     * @param string $value
     * @return string
     */
    private function convert_emphasis($tag, $value)
    {
        $is_italic = ($tag == 'i' || $tag == 'em');
        $marker = $is_italic ? $this->options['italic_style'] : $this->options['bold_style'];

        return $marker . $value . $marker;
    }

    /**
     * Convert Image
     *
     * Converts <img /> tags to Markdown.
     *
     * e.g.     <img src="/path/img.jpg" alt="alt text" title="Title" />
     * becomes  ![alt text](/path/img.jpg "Title")
     *
     * @param DOMElement $node
     * @return string
     */
    private function convert_image($node)
    {
        $src = $node->getAttribute('src');
        $alt = $node->getAttribute('alt');
        $title = $node->getAttribute('title');

        if ($title != "") {
            $markdown = '![' . $alt . '](' . $src . ' "' . $title . '")'; // No newlines added. <img> should be in a block-level element.
        } else {
            $markdown = '![' . $alt . '](' . $src . ')';
        }

        return $markdown;
    }

    /**
     * Convert Anchor
     *
     * Converts <a> tags to Markdown. Anchors without an href are kept as HTML.
     *
     * e.g.     <a href="http://modernnerd.net" title="Title">Modern Nerd</a>
     * becomes  [Modern Nerd](http://modernnerd.net "Title")
     *
     * @param DOMElement $node
     * @return string
     */
    private function convert_anchor($node)
    {
        $href = $node->getAttribute('href');
        $title = $node->getAttribute('title');
        $text = $node->nodeValue;

        if (!$href) {
            // Nothing to link to: keep the original markup
            $markdown = html_entity_decode($node->C14N());
        } elseif ($title != "") {
            $markdown = '[' . $text . '](' . $href . ' "' . $title . '")';
        } else {
            $markdown = '[' . $text . '](' . $href . ')';
        }

        // Append a space if the node after this one is also an anchor
        $next_node_name = $this->get_next_node_name($node);

        if ($next_node_name == 'a')
            $markdown = $markdown . ' ';

        return $markdown;
    }

    /**
     * Convert List
     *
     * Converts a single <li> of a <ul> or <ol> list to Markdown.
     *
     * @param DOMNode $node
     * @return string
     */
    private function convert_list($node)
    {
        // If parent is a ul, use dashes, otherwise, use numbers
        $list_type = $node->parentNode->nodeName;
        $value = $node->nodeValue;

        if ($list_type == "ul") {
            $markdown = "- " . trim($value) . PHP_EOL;
        } else {
            $number = $this->get_position($node);
            $markdown = $number . ". " . trim($value) . PHP_EOL;
        }

        return $markdown;
    }

    /**
     * Convert Code
     *
     * Convert code tags by indenting blocks of code and wrapping single lines in backticks.
     *
     * @param DOMNode $node
     * @return string
     */
    private function convert_code($node)
    {
        $markdown = '';

        // Store the content of the code block in an array, one entry for each line
        $code_content = html_entity_decode($node->C14N());
        $code_content = str_replace(array("<code>", "</code>"), "", $code_content);
        $code_content = str_replace(array("<pre>", "</pre>"), "", $code_content);

        $lines = preg_split('/\r\n|\r|\n/', $code_content);
        $total = count($lines);

        // If there's more than one line of code, prepend each line with four spaces and no backticks.
        if ($total > 1 || $node->nodeName === 'pre') {

            // Remove the first and last line if they're empty (ignoring XML style carriage returns).
            // Compared against '' rather than empty() so a line consisting of "0" is kept.
            $first_line = trim(str_replace('&#xD;', '', $lines[0]));
            $last_line = trim(str_replace('&#xD;', '', $lines[$total - 1]));

            if ($first_line === '')
                array_shift($lines);

            if ($last_line === '')
                array_pop($lines);

            $count = 1;
            foreach ($lines as $line) {
                $line = str_replace('&#xD;', '', $line);
                $markdown .= "    " . $line;
                // Add newlines, except final line of the code.
                // NOTE: $total is the line count taken before empty first/last lines were dropped;
                // the comparison is kept as-is so existing output does not change.
                if ($count != $total)
                    $markdown .= PHP_EOL;
                $count++;
            }
            $markdown .= PHP_EOL;

        } else { // There's only one line of code. It's a code span, not a block. Just wrap it with backticks.

            $markdown .= "`" . $lines[0] . "`";

        }

        return $markdown;
    }

    /**
     * Convert blockquote
     *
     * Prepend blockquotes with > chars.
     *
     * @param DOMNode $node
     * @return string
     */
    private function convert_blockquote($node)
    {
        // Contents should have already been converted to Markdown by this point,
        // so we just need to add ">" symbols to each line.
        $markdown = '';

        $quote_content = trim($node->nodeValue);
        $lines = preg_split('/\r\n|\r|\n/', $quote_content);

        foreach ($lines as $line) {
            $markdown .= "> " . $line . PHP_EOL;
        }

        // Blank line after the last quoted line closes the blockquote
        $markdown .= PHP_EOL;

        return $markdown;
    }

    /**
     * Get Position
     *
     * Returns the numbered position of a node inside its parent
     *
     * @param DOMNode $node
     * @return int The numbered position of the node, starting at 1.
     */
    private function get_position($node)
    {
        // Get all of the nodes inside the parent
        $list_nodes = $node->parentNode->childNodes;
        $total_nodes = $list_nodes->length;

        $position = 1;

        // Loop through all nodes and find the given $node
        for ($a = 0; $a < $total_nodes; $a++) {
            $current_node = $list_nodes->item($a);

            if ($current_node->isSameNode($node)) {
                $position = $a + 1;
                break; // A node appears only once among its siblings
            }
        }

        return $position;
    }

    /**
     * Get Next Node Name
     *
     * Return the name of the node immediately after the passed one.
     *
     * @param DOMNode $node
     * @return string|null The node name (e.g. 'h1') or null.
     */
    private function get_next_node_name($node)
    {
        $next_node_name = null;

        // get_position() is 1-based, so it is also the 0-based index of the following sibling
        $current_position = $this->get_position($node);
        $next_node = $node->parentNode->childNodes->item($current_position);

        if ($next_node)
            $next_node_name = $next_node->nodeName;

        return $next_node_name;
    }

    /**
     * To String
     *
     * Magic method to return Markdown output when HTML_To_Markdown instance is treated as a string.
     *
     * @return string
     */
    public function __toString()
    {
        return $this->output();
    }

    /**
     * Output
     *
     * Getter for the converted Markdown contents stored in $this->output
     *
     * @return string The Markdown, or an empty string if nothing has been converted or conversion failed.
     */
    public function output()
    {
        // Type check instead of truthiness so that a result of "0" is not reported as empty
        return is_string($this->output) ? $this->output : '';
    }
}