* @copyright 2016 Laurent MINGUET
*/
class HTML2PDF_parsingHtml
{
protected $_html = ''; // HTML code to parse
protected $_num = 0; // table number
protected $_level = 0; // table level
protected $_encoding = ''; // encoding
public $code = array(); // parsed HTML code
const HTML_TAB = ' ';
/**
* main constructor
*
* @param string $encoding
* @access public
*/
public function __construct($encoding = 'UTF-8')
{
$this->_num = 0;
$this->_level = array($this->_num);
$this->_html = '';
$this->code = array();
$this->setEncoding($encoding);
}
/**
* change the encoding
*
* @param string $encoding
* @access public
*/
public function setEncoding($encoding)
{
$this->_encoding = $encoding;
}
/**
* Define the HTML code to parse
*
* @param string $html code
* @access public
*/
public function setHTML($html)
{
// remove the HTML in comment
$html = preg_replace('//isU', '', $html);
// save the HTML code
$this->_html = $html;
}
/**
* parse the HTML code
*
* @access public
*/
public function parse()
{
$parents = array();
// flag : are we in a Tag ?
$tagPreIn = false;
// action to use for each line of the content of a Tag
$tagPreBr = array(
'name' => 'br',
'close' => false,
'param' => array(
'style' => array(),
'num' => 0
)
);
// tag that can be not closed
$tagsNotClosed = array(
'br', 'hr', 'img', 'col',
'input', 'link', 'option',
'circle', 'ellipse', 'path', 'rect', 'line', 'polygon', 'polyline'
);
// search the HTML tags
$parts = $this->_searchCode();
// all the actions to do
$actions = array();
// foreach part of the HTML code
foreach ($parts as $part) {
// if it is a tag code
if ($part[0] == 'code') {
// analyze the HTML code
$res = $this->_analyzeCode($part[1]);
// if it is a real HTML tag
if ($res) {
// save the current position in the HTML code
$res['html_pos'] = $part[2];
// if the tag must be closed
if (!in_array($res['name'], $tagsNotClosed)) {
// if it is a closure tag
if ($res['close']) {
// HTML validation
if (count($parents) < 1) {
throw new HTML2PDF_exception(3, $res['name'], $this->getHtmlErrorCode($res['html_pos']));
} else if (end($parents) != $res['name']) {
throw new HTML2PDF_exception(4, $parents, $this->getHtmlErrorCode($res['html_pos']));
} else {
array_pop($parents);
}
} else {
// if it is an auto-closed tag
if ($res['autoclose']) {
// save the opened tag
$actions[] = $res;
// prepare the closed tag
$res['params'] = array();
$res['close'] = true;
} else {
// else: add a child for validation
array_push($parents, $res['name']);
}
}
// if it is a tag (or tag) not auto-closed => update the flag
if (($res['name'] == 'pre' || $res['name'] == 'code') && !$res['autoclose']) {
$tagPreIn = !$res['close'];
}
}
// save the actions to convert
$actions[] = $res;
} else { // else (it is not a real HTML tag => we transform it in Text
$part[0] = 'txt';
}
}
// if it is text
if ($part[0] == 'txt') {
// if we are not in a tag
if (!$tagPreIn) {
// save the action
$actions[] = array(
'name' => 'write',
'close' => false,
'param' => array('txt' => $this->_prepareTxt($part[1])),
);
} else { // else (if we are in a tag)
// prepare the text
$part[1] = str_replace("\r", '', $part[1]);
$part[1] = explode("\n", $part[1]);
// foreach line of the text
foreach ($part[1] as $k => $txt) {
// transform the line
$txt = str_replace("\t", self::HTML_TAB, $txt);
$txt = str_replace(' ', ' ', $txt);
// add a break line
if ($k > 0) {
$actions[] = $tagPreBr;
}
// save the action
$actions[] = array(
'name' => 'write',
'close' => false,
'param' => array('txt' => $this->_prepareTxt($txt, false)),
);
}
}
}
}
// for each identified action, we have to clean up the begin and the end of the texte
// based on tags that surround it
// list of the tags to clean
$tagsToClean = array(
'page', 'page_header', 'page_footer', 'form',
'table', 'thead', 'tfoot', 'tr', 'td', 'th', 'br',
'div', 'hr', 'p', 'ul', 'ol', 'li',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'bookmark', 'fieldset', 'legend',
'draw', 'circle', 'ellipse', 'path', 'rect', 'line', 'g', 'polygon', 'polyline',
'option'
);
// foreach action
$nb = count($actions);
for ($k = 0; $k < $nb; $k++) {
// if it is a Text
if ($actions[$k]['name']=='write') {
// if the tag before the text is a tag to clean => ltrim on the text
if ($k>0 && in_array($actions[$k - 1]['name'], $tagsToClean))
$actions[$k]['param']['txt'] = ltrim($actions[$k]['param']['txt']);
// if the tag after the text is a tag to clean => rtrim on the text
if ($k < $nb - 1 && in_array($actions[$k + 1]['name'], $tagsToClean))
$actions[$k]['param']['txt'] = rtrim($actions[$k]['param']['txt']);
// if the text is empty => remove the action
if (!strlen($actions[$k]['param']['txt'])) {
unset($actions[$k]);
}
}
}
// if we are not on the level 0 => HTML validator ERROR
if (count($parents)) {
throw new HTML2PDF_exception(5, $parents);
}
// save the actions to do
$this->code = array_values($actions);
}
/**
* prepare the text
*
* @param string $txt
* @param boolean $spaces true => replace multiple space+\t+\r+\n by a single space
* @return string txt
* @access protected
*/
protected function _prepareTxt($txt, $spaces = true)
{
if ($spaces) $txt = preg_replace('/\s+/isu', ' ', $txt);
$txt = str_replace('€', '€', $txt);
$txt = html_entity_decode($txt, ENT_QUOTES, $this->_encoding);
return $txt;
}
/**
* parse the HTML code
*
* @return array
*/
protected function _searchCode()
{
// initialise the array
$parts = array();
// regexp to separate the tags from the texts
$reg = '/(<[^>]+>)|([^<]+)+/isU';
// last match found
$str = '';
$offset = 0;
// As it finds a match
while (preg_match($reg, $this->_html, $parse, PREG_OFFSET_CAPTURE, $offset)) {
// if it is a tag
if ($parse[1][0]) {
// save the previous text if it exists
if ($str !== '') {
$parts[] = array('txt', $str);
}
// save the tag, with the offset
$parts[] = array('code', trim($parse[1][0]), $offset);
// init the current text
$str = '';
} else { // else (if it is a text)
// add the new text to the current text
$str .= $parse[2][0];
}
// Update offset to the end of the match
$offset = $parse[0][1] + strlen($parse[0][0]);
unset($parse);
}
// if a text is present in the end, we save it
if ($str != '') {
$parts[] = array('txt', $str);
}
return $parts;
}
/**
* analise a HTML tag
*
* @param string $code HTML code to analise
* @return array corresponding action
*/
protected function _analyzeCode($code)
{
// name of the tag, opening, closure, autoclosure
$tag = '<([\/]{0,1})([_a-z0-9]+)([\/>\s]+)';
if (!preg_match('/'.$tag.'/isU', $code, $match)) {
return null;
}
$close = ($match[1] == '/' ? true : false);
$autoclose = preg_match('/\/>$/isU', $code);
$name = strtolower($match[2]);
// required parameters (depends on the tag name)
$param = array();
$param['style'] = '';
if ($name == 'img') {
$param['alt'] = '';
$param['src'] = '';
}
if ($name == 'a') {
$param['href'] = '';
}
// read the parameters : name=value
$prop = '([a-zA-Z0-9_]+)=([^"\'\s>]+)';
preg_match_all('/'.$prop.'/is', $code, $match);
for ($k = 0; $k < count($match[0]); $k++) {
$param[trim(strtolower($match[1][$k]))] = trim($match[2][$k]);
}
// read the parameters : name="value"
$prop = '([a-zA-Z0-9_]+)=["]([^"]*)["]';
preg_match_all('/'.$prop.'/is', $code, $match);
for ($k = 0; $k < count($match[0]); $k++) {
$param[trim(strtolower($match[1][$k]))] = trim($match[2][$k]);
}
// read the parameters : name='value'
$prop = "([a-zA-Z0-9_]+)=[']([^']*)[']";
preg_match_all('/'.$prop.'/is', $code, $match);
for ($k = 0; $k < count($match[0]); $k++) {
$param[trim(strtolower($match[1][$k]))] = trim($match[2][$k]);
}
// compliance of each parameter
$color = "#000000";
$border = null;
foreach ($param as $key => $val) {
$key = strtolower($key);
switch($key)
{
case 'width':
unset($param[$key]);
$param['style'] .= 'width: '.$val.'px; ';
break;
case 'align':
if ($name === 'img') {
unset($param[$key]);
$param['style'] .= 'float: '.$val.'; ';
} elseif ($name !== 'table') {
unset($param[$key]);
$param['style'] .= 'text-align: '.$val.'; ';
}
break;
case 'valign':
unset($param[$key]);
$param['style'] .= 'vertical-align: '.$val.'; ';
break;
case 'height':
unset($param[$key]);
$param['style'] .= 'height: '.$val.'px; ';
break;
case 'bgcolor':
unset($param[$key]);
$param['style'] .= 'background: '.$val.'; ';
break;
case 'bordercolor':
unset($param[$key]);
$color = $val;
break;
case 'border':
unset($param[$key]);
if (preg_match('/^[0-9]+$/isU', $val)) {
$val = $val.'px';
}
$border = $val;
break;
case 'cellpadding':
case 'cellspacing':
if (preg_match('/^([0-9]+)$/isU', $val)) {
$param[$key] = $val.'px';
}
break;
case 'colspan':
case 'rowspan':
$val = preg_replace('/[^0-9]/isU', '', $val);
if (!$val) {
$val = 1;
}
$param[$key] = $val;
break;
case 'color':
if ($name == 'font') {
$param['style'] .= 'color: ' . $val . ';';
}
break;
}
}
// compliance of the border
if ($border !== null) {
if ($border) $border = 'border: solid '.$border.' '.$color;
else $border = 'border: none';
$param['style'] .= $border.'; ';
$param['border'] = $border;
}
// reading styles: decomposition and standardization
$styles = explode(';', $param['style']);
$param['style'] = array();
foreach ($styles as $style) {
$tmp = explode(':', $style);
if (count($tmp) > 1) {
$cod = $tmp[0];
unset($tmp[0]);
$tmp = implode(':', $tmp);
$param['style'][trim(strtolower($cod))] = preg_replace('/[\s]+/isU', ' ', trim($tmp));
}
}
// determining the level of table opening, with an added level
if (in_array($name, array('ul', 'ol', 'table')) && !$close) {
$this->_num++;
$this->_level[count($this->_level)] = $this->_num;
}
// get the level of the table containing the element
if (!isset($param['num'])) {
$param['num'] = $this->_level[count($this->_level) - 1];
}
// for closures table: remove a level
if (in_array($name, array('ul', 'ol', 'table')) && $close) {
unset($this->_level[count($this->_level) - 1]);
}
// prepare the parameters
if (isset($param['value'])) $param['value'] = $this->_prepareTxt($param['value']);
if (isset($param['alt'])) $param['alt'] = $this->_prepareTxt($param['alt']);
if (isset($param['title'])) $param['title'] = $this->_prepareTxt($param['title']);
if (isset($param['class'])) $param['class'] = $this->_prepareTxt($param['class']);
// return the new action to do
return array('name' => $name, 'close' => $close ? 1 : 0, 'autoclose' => $autoclose, 'param' => $param);
}
/**
* get a full level of HTML, between an opening and closing corresponding
*
* @param integer $k
* @return array actions
*/
public function getLevel($k)
{
// if the code does not exist => return empty
if (!isset($this->code[$k])) {
return array();
}
// the tag to detect
$detect = $this->code[$k]['name'];
// if it is a text => return
if ($detect == 'write') {
return array($this->code[$k]);
}
//
$level = 0; // depth level
$end = false; // end of the search
$code = array(); // extract code
// while it's not ended
while (!$end) {
// current action
$row = $this->code[$k];
// if 'write' => we add the text
if ($row['name']=='write') {
$code[] = $row;
} else { // else, it is a html tag
$not = false; // flag for not taking into account the current tag
// if it is the searched tag
if ($row['name'] == $detect) {
// if we are just at the root level => dont take it
if ($level == 0) {
$not = true;
}
// update the level
$level+= ($row['close'] ? -1 : 1);
// if we are now at the root level => it is the end, and dont take it
if ($level == 0) {
$not = true;
$end = true;
}
}
// if we can take into account the current tag => save it
if (!$not) {
if (isset($row['style']['text-align'])) {
unset($row['style']['text-align']);
}
$code[] = $row;
}
}
// it continues as long as there has code to analyze
if (isset($this->code[$k + 1])) {
$k++;
} else {
$end = true;
}
}
// return the extract
return $code;
}
/**
* return a part of the HTML code, for error message
*
* @param integer $pos
* @param integer $before take before
* @param integer $after take after
* @return string part of the html code
*/
public function getHtmlErrorCode($pos, $before=30, $after=40)
{
return substr($this->_html, $pos-$before, $before+$after);
}
}