<?php
/**
 * HTML2PDF Library - parsingHtml class
 *
 * HTML => PDF convertor
 * distributed under the LGPL License
 *
 * @package   Html2pdf
 * @author    Laurent MINGUET <webmaster@html2pdf.fr>
 * @copyright 2016 Laurent MINGUET
 */
class HTML2PDF_parsingHtml
{
    protected $_html     = '';       // HTML code to parse
    protected $_num      = 0;        // number of the last opened table / list
    protected $_level    = array(0); // stack of the numbers of the currently opened tables / lists
    protected $_encoding = '';       // encoding used to decode the HTML entities
    public    $code      = array();  // parsed HTML code (list of actions)

    // replacement used for a tabulation inside a <pre> / <code> tag
    const HTML_TAB = ' ';

    /**
     * main constructor
     *
     * @param  string $encoding
     * @access public
     */
    public function __construct($encoding = 'UTF-8')
    {
        $this->_num   = 0;
        $this->_level = array($this->_num);
        $this->_html  = '';
        $this->code   = array();
        $this->setEncoding($encoding);
    }

    /**
     * change the encoding
     *
     * @param  string $encoding
     * @access public
     */
    public function setEncoding($encoding)
    {
        $this->_encoding = $encoding;
    }

    /**
     * Define the HTML code to parse. The HTML comments are removed.
     *
     * @param  string $html code
     * @access public
     */
    public function setHTML($html)
    {
        // remove the HTML in comment
        $cleaned = preg_replace('/<!--(.*)-->/isU', '', $html);

        // preg_replace returns null on failure: in that case keep the given HTML
        // instead of replacing the whole document by null
        if ($cleaned !== null) {
            $html = $cleaned;
        }

        // save the HTML code
        $this->_html = $html;
    }

    /**
     * parse the HTML code, and save the resulting list of actions in $this->code
     *
     * Each action is an array with at least the keys 'name', 'close' and 'param'.
     * The texts are converted into actions named 'write'.
     *
     * @throws HTML2PDF_exception code 3: closing tag without any opened tag
     * @throws HTML2PDF_exception code 4: closing tag that does not match the last opened tag
     * @throws HTML2PDF_exception code 5: some tags are still opened at the end of the HTML
     * @access public
     */
    public function parse()
    {
        // stack of the names of the currently opened tags (HTML validation)
        $parents = array();

        // flag : are we in a <pre> Tag ?
        $tagPreIn = false;

        // action to use for each line of the content of a <pre> Tag
        $tagPreBr = array(
            'name'  => 'br',
            'close' => false,
            'param' => array(
                'style' => array(),
                'num'   => 0
            )
        );

        // tag that can be not closed
        $tagsNotClosed = array(
            'br', 'hr', 'img', 'col',
            'input', 'link', 'option',
            'circle', 'ellipse', 'path', 'rect', 'line', 'polygon', 'polyline'
        );

        // search the HTML tags
        $parts = $this->_searchCode();

        // all the actions to do
        $actions = array();

        // foreach part of the HTML code
        foreach ($parts as $part) {
            // if it is a tag code
            if ($part[0] == 'code') {
                // analyze the HTML code
                $res = $this->_analyzeCode($part[1]);

                // if it is a real HTML tag
                if ($res) {
                    // save the current position in the HTML code
                    $res['html_pos'] = $part[2];

                    // if the tag must be closed
                    if (!in_array($res['name'], $tagsNotClosed, true)) {
                        // if it is a closure tag
                        if ($res['close']) {
                            // HTML validation
                            if (count($parents) < 1) {
                                throw new HTML2PDF_exception(3, $res['name'], $this->getHtmlErrorCode($res['html_pos']));
                            } else if (end($parents) != $res['name']) {
                                throw new HTML2PDF_exception(4, $parents, $this->getHtmlErrorCode($res['html_pos']));
                            } else {
                                array_pop($parents);
                            }
                        } else {
                            // if it is an auto-closed tag
                            if ($res['autoclose']) {
                                // save the opened tag
                                $actions[] = $res;

                                // prepare the closed tag
                                // NOTE(review): the key is 'params' while every other place uses 'param',
                                // so the closing action keeps the parameters of the opening one.
                                // Kept as is, because consumers may rely on it - TODO confirm.
                                $res['params'] = array();
                                $res['close'] = true;
                            } else {
                                // else: add a child for validation
                                array_push($parents, $res['name']);
                            }
                        }

                        // if it is a <pre> tag (or <code> tag) not auto-closed => update the flag
                        if (($res['name'] == 'pre' || $res['name'] == 'code') && !$res['autoclose']) {
                            $tagPreIn = !$res['close'];
                        }
                    }

                    // save the actions to convert
                    $actions[] = $res;
                } else {
                    // else (it is not a real HTML tag => we transform it in Text
                    $part[0] = 'txt';
                }
            }

            // if it is text
            if ($part[0] == 'txt') {
                if (!$tagPreIn) {
                    // we are not in a <pre> tag => save the action
                    $actions[] = array(
                        'name'  => 'write',
                        'close' => false,
                        'param' => array('txt' => $this->_prepareTxt($part[1])),
                    );
                } else {
                    // we are in a <pre> tag => one 'write' action per line, separated by <br>
                    $lines = explode("\n", str_replace("\r", '', $part[1]));

                    foreach ($lines as $k => $txt) {
                        // transform the line: the spaces are converted to non-breaking spaces,
                        // so that the indentation survives the trim done on the texts below
                        $txt = str_replace("\t", self::HTML_TAB, $txt);
                        $txt = str_replace(' ', '&nbsp;', $txt);

                        // add a break line
                        if ($k > 0) {
                            $actions[] = $tagPreBr;
                        }

                        // save the action
                        $actions[] = array(
                            'name'  => 'write',
                            'close' => false,
                            'param' => array('txt' => $this->_prepareTxt($txt, false)),
                        );
                    }
                }
            }
        }

        // for each identified action, we have to clean up the begin and the end of the text
        // based on tags that surround it

        // list of the tags to clean
        $tagsToClean = array(
            'page', 'page_header', 'page_footer', 'form',
            'table', 'thead', 'tfoot', 'tr', 'td', 'th', 'br',
            'div', 'hr', 'p', 'ul', 'ol', 'li',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            'bookmark', 'fieldset', 'legend',
            'draw', 'circle', 'ellipse', 'path', 'rect', 'line', 'g', 'polygon', 'polyline',
            'option'
        );

        // foreach action
        $nb = count($actions);
        for ($k = 0; $k < $nb; $k++) {
            // only the texts have to be cleaned
            if ($actions[$k]['name'] != 'write') {
                continue;
            }

            // if the tag before the text is a tag to clean => ltrim on the text
            // (the previous action may have been removed by this loop => isset)
            if (isset($actions[$k - 1]) && in_array($actions[$k - 1]['name'], $tagsToClean, true)) {
                $actions[$k]['param']['txt'] = ltrim($actions[$k]['param']['txt']);
            }

            // if the tag after the text is a tag to clean => rtrim on the text
            if (isset($actions[$k + 1]) && in_array($actions[$k + 1]['name'], $tagsToClean, true)) {
                $actions[$k]['param']['txt'] = rtrim($actions[$k]['param']['txt']);
            }

            // if the text is empty => remove the action
            if (!strlen($actions[$k]['param']['txt'])) {
                unset($actions[$k]);
            }
        }

        // if we are not on the level 0 => HTML validator ERROR
        if (count($parents)) {
            throw new HTML2PDF_exception(5, $parents);
        }

        // save the actions to do (re-indexed, because some texts may have been removed)
        $this->code = array_values($actions);
    }

    /**
     * prepare the text: collapse the white spaces (optional), then decode the HTML entities
     *
     * @param  string  $txt
     * @param  boolean $spaces true => replace multiple space+\t+\r+\n by a single space
     * @return string  txt
     * @access protected
     */
    protected function _prepareTxt($txt, $spaces = true)
    {
        if ($spaces) {
            $collapsed = preg_replace('/\s+/isu', ' ', $txt);

            // with the "u" modifier, preg_replace returns null if the text is not valid UTF-8
            // (other encoding, broken input) => fall back to the byte-wise pattern
            if ($collapsed === null) {
                $collapsed = preg_replace('/\s+/is', ' ', $txt);
            }

            if ($collapsed !== null) {
                $txt = $collapsed;
            }
        }

        // decode the euro entity explicitly, before the generic decoding of the entities
        $txt = str_replace('&euro;', '€', $txt);
        $txt = html_entity_decode($txt, ENT_QUOTES, $this->_encoding);

        return $txt;
    }

    /**
     * split the HTML code in a list of tags and texts
     *
     * Each part is either array('txt', text) or array('code', tag, position of the tag in the HTML).
     *
     * @return array
     */
    protected function _searchCode()
    {
        // initialise the array
        $parts = array();

        // regexp to separate the tags from the texts
        $reg = '/(<[^>]+>)|([^<]+)/is';

        // current text, not saved yet
        $str = '';
        $offset = 0;
        $length = strlen($this->_html);

        // As it finds a match
        while (preg_match($reg, $this->_html, $parse, PREG_OFFSET_CAPTURE, $offset)) {
            // a "<" that does not begin a tag is skipped by the regexp => keep it as text
            if ($parse[0][1] > $offset) {
                $str .= substr($this->_html, $offset, $parse[0][1] - $offset);
            }

            if ($parse[1][0] !== '') {
                // it is a tag => save the previous text if it exists
                if ($str !== '') {
                    $parts[] = array('txt', $str);
                }

                // save the tag, with its position in the HTML code
                $parts[] = array('code', trim($parse[1][0]), $parse[1][1]);

                // init the current text
                $str = '';
            } else {
                // it is a text => add it to the current text
                $str .= $parse[2][0];
            }

            // Update offset to the end of the match
            $offset = $parse[0][1] + strlen($parse[0][0]);
            unset($parse);
        }

        // what remains after the last match (only "<" without any ">") is text too
        if ($offset < $length) {
            $str .= substr($this->_html, $offset);
        }

        // if a text is present in the end, we save it
        if ($str !== '') {
            $parts[] = array('txt', $str);
        }

        return $parts;
    }

    /**
     * analyse a HTML tag
     *
     * The returned action has the keys 'name' (lower case), 'close' (1 or 0),
     * 'autoclose' (result of the preg_match on "/>") and 'param' (attributes of the tag,
     * with 'style' converted to an array property => value, and 'num' the number
     * of the table / list that contains the tag).
     *
     * @param  string $code HTML code to analyse
     * @return array|null corresponding action, null if it is not a HTML tag
     */
    protected function _analyzeCode($code)
    {
        // name of the tag, opening, closure, autoclosure
        $tag = '<([\/]{0,1})([_a-z0-9]+)([\/>\s]+)';
        if (!preg_match('/'.$tag.'/isU', $code, $match)) {
            return null;
        }
        $close     = ($match[1] == '/' ? true : false);
        $autoclose = preg_match('/\/>$/isU', $code);
        $name      = strtolower($match[2]);

        // required parameters (depends on the tag name)
        $param = array();
        $param['style'] = '';
        if ($name == 'img') {
            $param['alt'] = '';
            $param['src'] = '';
        }
        if ($name == 'a') {
            $param['href'] = '';
        }

        // read the parameters, in this order: name=value, name="value", name='value'
        // (a later form overwrites an earlier one for the same name)
        $props = array(
            '([a-zA-Z0-9_]+)=([^"\'\s>]+)',
            '([a-zA-Z0-9_]+)=["]([^"]*)["]',
            "([a-zA-Z0-9_]+)=[']([^']*)[']",
        );
        foreach ($props as $prop) {
            preg_match_all('/'.$prop.'/is', $code, $match);
            $nbMatch = count($match[0]);
            for ($k = 0; $k < $nbMatch; $k++) {
                $param[trim(strtolower($match[1][$k]))] = trim($match[2][$k]);
            }
        }

        // the styles coming from the attributes are appended to the style attribute:
        // it must end with a ";", else the first appended property is merged with the last one
        $param['style'] = trim($param['style']);
        if ($param['style'] !== '' && substr($param['style'], -1) !== ';') {
            $param['style'] .= ';';
        }

        // compliance of each parameter
        // NOTE(review): 'px' is always appended to width and height, even when the value
        // already has a unit (ex: "100%") - kept as is, TODO confirm the expected behavior
        $color  = "#000000";
        $border = null;
        foreach ($param as $key => $val) {
            $key = strtolower($key);
            switch($key) {
                case 'width':
                    unset($param[$key]);
                    $param['style'] .= 'width: '.$val.'px; ';
                    break;

                case 'align':
                    if ($name === 'img') {
                        unset($param[$key]);
                        $param['style'] .= 'float: '.$val.'; ';
                    } elseif ($name !== 'table') {
                        unset($param[$key]);
                        $param['style'] .= 'text-align: '.$val.'; ';
                    }
                    break;

                case 'valign':
                    unset($param[$key]);
                    $param['style'] .= 'vertical-align: '.$val.'; ';
                    break;

                case 'height':
                    unset($param[$key]);
                    $param['style'] .= 'height: '.$val.'px; ';
                    break;

                case 'bgcolor':
                    unset($param[$key]);
                    $param['style'] .= 'background: '.$val.'; ';
                    break;

                case 'bordercolor':
                    unset($param[$key]);
                    $color = $val;
                    break;

                case 'border':
                    unset($param[$key]);
                    if (preg_match('/^[0-9]+$/isU', $val)) {
                        $val = $val.'px';
                    }
                    $border = $val;
                    break;

                case 'cellpadding':
                case 'cellspacing':
                    if (preg_match('/^([0-9]+)$/isU', $val)) {
                        $param[$key] = $val.'px';
                    }
                    break;

                case 'colspan':
                case 'rowspan':
                    $val = preg_replace('/[^0-9]/isU', '', $val);
                    if (!$val) {
                        $val = 1;
                    }
                    $param[$key] = $val;
                    break;

                case 'color':
                    if ($name == 'font') {
                        $param['style'] .= 'color: ' . $val . ';';
                    }
                    break;
            }
        }

        // compliance of the border
        if ($border !== null) {
            if ($border) {
                $border = 'border: solid '.$border.' '.$color;
            } else {
                $border = 'border: none';
            }

            $param['style'] .= $border.'; ';
            $param['border'] = $border;
        }

        // reading styles: decomposition and standardization
        $styles = explode(';', $param['style']);
        $param['style'] = array();
        foreach ($styles as $style) {
            $tmp = explode(':', $style);
            if (count($tmp) > 1) {
                // the value can contain some ":" (ex: url) => only the first part is the property
                $cod = $tmp[0];
                unset($tmp[0]);
                $tmp = implode(':', $tmp);
                $param['style'][trim(strtolower($cod))] = preg_replace('/[\s]+/isU', ' ', trim($tmp));
            }
        }

        $isLevelTag = in_array($name, array('ul', 'ol', 'table'), true);

        // determining the level of table opening, with an added level
        if ($isLevelTag && !$close) {
            $this->_num++;
            $this->_level[] = $this->_num;
        }

        // get the level of the table containing the element
        if (!isset($param['num'])) {
            $param['num'] = $this->_level[count($this->_level) - 1];
        }

        // for closures table: remove a level
        // (the root level is never removed, even with a closing tag without opening tag)
        if ($isLevelTag && $close && count($this->_level) > 1) {
            array_pop($this->_level);
        }

        // prepare the parameters
        foreach (array('value', 'alt', 'title', 'class') as $key) {
            if (isset($param[$key])) {
                $param[$key] = $this->_prepareTxt($param[$key]);
            }
        }

        // return the new action to do
        return array('name' => $name, 'close' => $close ? 1 : 0, 'autoclose' => $autoclose, 'param' => $param);
    }

    /**
     * get a full level of HTML, between an opening and closing corresponding
     *
     * The opening tag found at the position $k and its corresponding closing tag
     * are not included in the result. If the action at the position $k is a text,
     * it is returned alone.
     *
     * @param  integer $k position of the opening action in $this->code
     * @return array   actions
     */
    public function getLevel($k)
    {
        // if the code does not exist => return empty
        if (!isset($this->code[$k])) {
            return array();
        }

        // the tag to detect
        $detect = $this->code[$k]['name'];

        // if it is a text => return
        if ($detect == 'write') {
            return array($this->code[$k]);
        }

        $level = 0;      // depth level
        $end   = false;  // end of the search
        $code  = array(); // extract code

        // while it's not ended
        while (!$end) {
            // current action
            $row = $this->code[$k];

            if ($row['name'] == 'write') {
                // if 'write' => we add the text
                $code[] = $row;
            } else {
                // else, it is a html tag
                $not = false; // flag for not taking into account the current tag

                // if it is the searched tag
                if ($row['name'] == $detect) {
                    // if we are just at the root level => dont take it
                    if ($level == 0) {
                        $not = true;
                    }

                    // update the level
                    $level += ($row['close'] ? -1 : 1);

                    // if we are now at the root level => it is the end, and dont take it
                    if ($level == 0) {
                        $not = true;
                        $end = true;
                    }
                }

                // if we can take into account the current tag => save it
                if (!$not) {
                    // NOTE(review): the actions built by parse() have no 'style' key at this level
                    // (the styles are in ['param']['style']), so this looks like dead code.
                    // Kept as is: removing the text-align of the children would change the result.
                    if (isset($row['style']['text-align'])) {
                        unset($row['style']['text-align']);
                    }
                    $code[] = $row;
                }
            }

            // it continues as long as there has code to analyze
            if (isset($this->code[$k + 1])) {
                $k++;
            } else {
                $end = true;
            }
        }

        // return the extract
        return $code;
    }

    /**
     * return a part of the HTML code, for error message
     *
     * @param  integer $pos    position in the HTML code
     * @param  integer $before number of characters to take before the position
     * @param  integer $after  number of characters to take after the position
     * @return string  part of the html code
     */
    public function getHtmlErrorCode($pos, $before=30, $after=40)
    {
        // a negative start would make substr count from the end of the HTML code
        $start = max(0, $pos - $before);

        return substr($this->_html, $start, ($pos - $start) + $after);
    }
}