TobyInkster.co.uk
20/07/2007: Parsing an HTML Table with PEAR’s XML_HTMLSax3
Here’s an example of how to parse an HTML table into an array using the PEAR module XML_HTMLSax3. It supports the <tr>, <td> and <th> elements and the rowspan and colspan attributes.
It’s worth noting that this code will raise a bunch of notices (undefined index/offset, because it reads array elements before they have been set) if you run it with all errors displayed. This is pretty difficult to avoid, so if you don’t like that, disable the display of notices.
<?php
/**
* Example of how to parse an HTML table using PEAR XML_HTMLSax3.
*
* Copyright (C) 2007 Toby Inkster
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program.
* If not, see <http://www.gnu.org/licenses/>.
*
* @author Toby Inkster
* @copyright Copyright (C) 2007 Toby Inkster
* @license http://www.gnu.org/licenses/lgpl-3.0.html GNU Lesser General Public Licence
*/
/**
 * Parses an HTML table into a two-dimensional array of cell text.
 *
 * An instance acts as the handler object for XML_HTMLSax3: the parser calls
 * openHandler(), closeHandler() and dataHandler() as it walks the markup.
 * You probably only need to directly access the "Go" method.
 *
 * The result is indexed as $data[row][column], both zero-based. A cell using
 * rowspan/colspan stores its text at its top-left co-ordinate only; the other
 * grid positions it covers have no key in the result.
 *
 * Every <tr> starts a new row regardless of nesting, so nested tables are
 * not supported.
 */
class TableParser
{
    /** Zero-based index of the row being parsed; -1 before the first row. */
    private $currow = -1;

    /** Zero-based index of the cell being parsed; -1 before a row's first cell. */
    private $curcol = -1;

    /** Grid positions already claimed by a cell: $shape[row][col] = TRUE. */
    private $shape = array();

    /** Collected cell text: $data[row][col] = string. */
    private $data = array();

    /** TRUE from an opening <td>/<th> until that cell (or its row/table) closes. */
    private $incell = FALSE;

    /**
     * SAX callback for an opening tag.
     *
     * <tr> starts a new row. <td>/<th> claim the next free grid position in
     * the current row, plus any further positions covered by rowspan/colspan.
     * All other tags are ignored.
     *
     * @param object $parser The SAX parser (unused).
     * @param string $tag    Tag name, in any letter case.
     * @param array  $attrs  Attribute name => value pairs.
     */
    public function openHandler ($parser, $tag, $attrs)
    {
        $tag = strtolower($tag);

        if ($tag=='tr')
        {
            $this->currow++;
            $this->curcol = -1;
            $this->incell = FALSE;
            // Register the row even if it turns out to contain no cells.
            $this->data[$this->currow] = array();
            return;
        }

        // Only cells occupy grid positions. Any other tag (<table>, or inline
        // markup such as <b> inside a cell) must leave the cursor alone;
        // otherwise it would find the current position already claimed and
        // skip to the next column, splitting one cell's text across two.
        if ($tag!='td' && $tag!='th')
            return;

        // A cell that appears before any <tr> starts an implicit first row.
        if ($this->currow < 0)
        {
            $this->currow = 0;
            $this->data[0] = array();
        }

        // Advance to the next position not already claimed by a rowspan or
        // colspan of an earlier cell. empty() avoids undefined-offset notices.
        $this->curcol++;
        while (!empty($this->shape[$this->currow][$this->curcol]))
            $this->curcol++;

        $rowspan = 1;
        $colspan = 1;
        if (is_array($attrs))
        {
            foreach ($attrs as $k=>$v)
            {
                $k = strtolower($k);
                // Clamp to 1 so that "0", negative or non-numeric values
                // still leave the cell occupying its own position.
                if ($k=='rowspan')
                    $rowspan = max(1, (int)$v);
                elseif ($k=='colspan')
                    $colspan = max(1, (int)$v);
            }
        }

        // Claim every grid position this cell covers.
        for ($i=0; $i<$rowspan; $i++)
        {
            for ($j=0; $j<$colspan; $j++)
            {
                $x = $this->currow + $i;
                $y = $this->curcol + $j;
                if (!empty($this->shape[$x][$y]))
                    error_log('Overlap!');
                $this->shape[$x][$y] = TRUE;
            }
        }

        // Start the cell with an empty string so that empty cells still get
        // a key and dataHandler() can append without raising a notice.
        $this->data[$this->currow][$this->curcol] = '';
        $this->incell = TRUE;
    }

    /**
     * SAX callback for a closing tag.
     *
     * Closing a cell, row or table ends the current cell, so that text found
     * between cells (typically whitespace) is not appended to the cell just
     * finished. Other closing tags, such as </b>, leave the cell open.
     *
     * @param object $parser The SAX parser (unused).
     * @param string $tag    Tag name, in any letter case.
     */
    public function closeHandler ($parser, $tag)
    {
        $tag = strtolower($tag);
        if ($tag=='td' || $tag=='th' || $tag=='tr' || $tag=='table')
            $this->incell = FALSE;
    }

    /**
     * SAX callback for character data; appends it to the current cell.
     *
     * Text that is not inside a cell is discarded.
     *
     * @param object $parser The SAX parser (unused).
     * @param string $data   The text found.
     */
    public function dataHandler ($parser, $data)
    {
        if (!$this->incell)
            return;
        $this->data[$this->currow][$this->curcol] .= $data;
    }

    /**
     * Returns the text collected so far.
     *
     * @return array Cell text indexed as [row][column].
     */
    public function getData ()
    {
        return $this->data;
    }

    /**
     * Parses a string of table markup and returns its cells.
     *
     * Requires the PEAR package XML_HTMLSax3 to be on the include path.
     *
     * @param string $table_html HTML containing a single table.
     * @return array Cell text indexed as [row][column].
     */
    public static function Go ($table_html)
    {
        require_once 'XML/HTMLSax3.php';
        $sax = new XML_HTMLSax3;
        $hdlr = new TableParser;
        $sax->set_object($hdlr);
        $sax->set_element_handler('openHandler', 'closeHandler');
        $sax->set_data_handler('dataHandler');
        $sax->parse($table_html);
        return $hdlr->getData();
    }
}
// Sample markup exercising both rowspan and colspan. The blank lines just
// inside the heredoc keep the string's leading and trailing newline.
$table = <<<HTML

<table>
<tr>
<td rowspan="2">Test table lalala</td>
<td>123</td>
<td>456</td>
</tr>
<tr>
<td>789</td>
<td>ABC</td>
</tr>
<tr>
<td colspan="2" rowspan="2">123</td>
<td>456</td>
</tr>
<tr>
<td>789</td>
</tr>
</table>

HTML;
// Parse the sample and dump the resulting [row][column] array.
echo print_r(TableParser::Go($table), TRUE);
?>
Comments
Comment 005
Copying and pasting your code produces this error:
Parse error: parse error, expecting `T_OLD_FUNCTION' or `T_FUNCTION' or `T_VAR' or `'}'' in /Users/test/Documents/projects/TableParser.php on line 34
Date: Thursday, 26th July 2007, 5:56pm (BST)
Comment 006
It’s designed for PHP 5. If you’re using PHP 4, you’ll need to massage the code a little.
In particular, replace “private $foo” with “var $foo”; “public function bar()” with “function bar()” and “public static function baz()” with “function baz()”.
Date: Sunday, 29th July 2007, 2:00pm (BST)
Comments are moderated and may take one or two days to show up on the site. You can bypass comment moderation by signing up as a registered user.