C
C#2y ago
malkav

❔ ✅ Scrape all <table> </table> elements

I'm trying to use AngleSharp to scrape any url (that is given as parameter) for all table elements on that page's HTML code, and parse the contents into a json. Here's an example of what I'm trying to achieve:
<table>
<thead>
<tr>
<th scope="col">Header 1</th>
<th scope="col">Header 2</th>
<!-- etc -->
</tr>
</thead>
<tbody>
<tr>
<th>1</th>
<td>data</td>
<!-- etc -->
</tr>
<!-- etc -->
</tbody>
</table>
<table>
<thead>
<tr>
<th scope="col">Header 1</th>
<th scope="col">Header 2</th>
<!-- etc -->
</tr>
</thead>
<tbody>
<tr>
<th>1</th>
<td>data</td>
<!-- etc -->
</tr>
<!-- etc -->
</tbody>
</table>
the json output:
[
{
"Header 1": "1",
"Header 2": "data",
// etc
},
// etc
]
[
{
"Header 1": "1",
"Header 2": "data",
// etc
},
// etc
]
The code I'm trying so far seems to make it hard to reach this actual element, even when trying to use QuerySelectorAll("table")
19 Replies
malkav
malkav2y ago
// Download the HTML found at data.Url as a single string.
using HttpClient client = new HttpClient();
string htmlCode = await client.GetStringAsync(data.Url);

// Parse the downloaded markup into a document that can be queried with CSS selectors.
HtmlParser parser = new();
IHtmlDocument document = parser.ParseDocument(htmlCode);

// Select every <table> element in the document.
var elements = document.QuerySelectorAll("table");

// NOTE(review): the loop variable is each table's attribute collection (e.Attributes),
// not the table element itself, so rows and cells are not reachable from it - confirm intent.
foreach (var element in elements.Select(e => e.Attributes))
{

}
// Download the HTML found at data.Url as a single string.
using HttpClient client = new HttpClient();
string htmlCode = await client.GetStringAsync(data.Url);

// Parse the downloaded markup into a document that can be queried with CSS selectors.
HtmlParser parser = new();
IHtmlDocument document = parser.ParseDocument(htmlCode);

// Select every <table> element in the document.
var elements = document.QuerySelectorAll("table");

// NOTE(review): the loop variable is each table's attribute collection (e.Attributes),
// not the table element itself, so rows and cells are not reachable from it - confirm intent.
foreach (var element in elements.Select(e => e.Attributes))
{

}
This is where I'm stuck at atm..
Pobiega
Pobiega2y ago
Does it properly find all tables? Because the next step would be to find all <thead><tr> element children and zip them with the <tbody><tr> children. That would give you N pairs of elements, each body-row column zipped with the corresponding header. Had some spare time, so I tried it. Works just fine.
[
{
"Header 1": "1",
"Header 2": "data"
}
]
[
{
"Header 1": "1",
"Header 2": "data"
}
]
is my output for the above pasted html
FusedQyou
FusedQyou2y ago
I did this with typescript, so I assume c# would be similar. If you have the table elements, you should just be able to get all child elements in it, which should guarantee to just be one thead and one tbody. You could then query all th tags from the thead, use that as the keys, and collect all tr tags from the tbody to use whatever is in there as the values? Not sure where you're stuck tho
Pobiega
Pobiega2y ago
// Load the sample page from disk and parse it into a queryable document.
var content = File.ReadAllText("angle.html");
var document = new HtmlParser().ParseDocument(content);

var tables = document.QuerySelectorAll("table");

// One entry is appended per table that has both a <thead> and a <tbody>.
var result = new List<Dictionary<string, string>>();

foreach (var table in tables)
{
    var head = table.QuerySelector("thead");
    var body = table.QuerySelector("tbody");

    // Report and skip tables missing either section; a missing head is reported first.
    if (head is null)
    {
        Console.WriteLine("Table without thead found!");
        continue;
    }

    if (body is null)
    {
        Console.WriteLine("Table without tbody found!");
        continue;
    }

    // Header cells and body rows are handed to GetTableResults as-is.
    result.Add(GetTableResults(head.QuerySelectorAll("th"), body.QuerySelectorAll("tr")));
}

var json = SerializeWithIndentation(result);
// Load the sample page from disk and parse it into a queryable document.
var content = File.ReadAllText("angle.html");
var document = new HtmlParser().ParseDocument(content);

var tables = document.QuerySelectorAll("table");

// One entry is appended per table that has both a <thead> and a <tbody>.
var result = new List<Dictionary<string, string>>();

foreach (var table in tables)
{
    var head = table.QuerySelector("thead");
    var body = table.QuerySelector("tbody");

    // Report and skip tables missing either section; a missing head is reported first.
    if (head is null)
    {
        Console.WriteLine("Table without thead found!");
        continue;
    }

    if (body is null)
    {
        Console.WriteLine("Table without tbody found!");
        continue;
    }

    // Header cells and body rows are handed to GetTableResults as-is.
    result.Add(GetTableResults(head.QuerySelectorAll("th"), body.QuerySelectorAll("tr")));
}

var json = SerializeWithIndentation(result);
The good stuff is in the GetTableResults method, which I leave as an exercise to the reader.
malkav
malkav2y ago
I've switched from the AngleSharp library to the HtmlAgilityPack library, and this is what I have so far. But this does not recursively find inner tables and such (yet):
// Download the page at data.Url and, for every <table>, build a flat list of
// "header: value" strings (one string per body cell, in row order).
using HttpClient client = new HttpClient();
string htmlCode = await client.GetStringAsync(data.Url);
HtmlDocument doc = new();
doc.LoadHtml(htmlCode);

List<List<string>> rObj = new();

// SelectNodes / SelectSingleNode return null (not an empty collection) when nothing
// matches, so every lookup is null-checked before it is used.
HtmlNodeCollection? tables = doc.DocumentNode.SelectNodes("//table");
if (tables != null)
{
    foreach (HtmlNode table in tables)
    {
        // Keys come from the cells of the first <tr> inside <thead>.
        HtmlNodeCollection? keys = table.SelectSingleNode("thead")
            ?.SelectNodes("tr")
            ?.FirstOrDefault()
            ?.SelectNodes("th | td");
        // Values come from the <tr> rows inside <tbody>.
        HtmlNodeCollection? vRows = table.SelectSingleNode("tbody")?.SelectNodes("tr");

        // A table without a usable head or body cannot be mapped to key/value pairs; skip it.
        if (keys == null || vRows == null)
        {
            continue;
        }

        List<string> pairs = new();
        foreach (HtmlNode row in vRows)
        {
            HtmlNodeCollection? cells = row.SelectNodes("th | td");
            if (cells == null)
            {
                continue;
            }

            // Cells beyond the last header have no key to pair with; ignore them
            // instead of indexing past the end of the header collection.
            int pairCount = Math.Min(cells.Count, keys.Count);
            for (int i = 0; i < pairCount; i++)
            {
                pairs.Add($"{keys[i].InnerHtml}: {cells[i].InnerHtml}");
            }
        }

        rObj.Add(pairs);
    }
}

return new OkObjectResult(rObj);
// Download the page at data.Url and, for every <table>, build a flat list of
// "header: value" strings (one string per body cell, in row order).
using HttpClient client = new HttpClient();
string htmlCode = await client.GetStringAsync(data.Url);
HtmlDocument doc = new();
doc.LoadHtml(htmlCode);

List<List<string>> rObj = new();

// SelectNodes / SelectSingleNode return null (not an empty collection) when nothing
// matches, so every lookup is null-checked before it is used.
HtmlNodeCollection? tables = doc.DocumentNode.SelectNodes("//table");
if (tables != null)
{
    foreach (HtmlNode table in tables)
    {
        // Keys come from the cells of the first <tr> inside <thead>.
        HtmlNodeCollection? keys = table.SelectSingleNode("thead")
            ?.SelectNodes("tr")
            ?.FirstOrDefault()
            ?.SelectNodes("th | td");
        // Values come from the <tr> rows inside <tbody>.
        HtmlNodeCollection? vRows = table.SelectSingleNode("tbody")?.SelectNodes("tr");

        // A table without a usable head or body cannot be mapped to key/value pairs; skip it.
        if (keys == null || vRows == null)
        {
            continue;
        }

        List<string> pairs = new();
        foreach (HtmlNode row in vRows)
        {
            HtmlNodeCollection? cells = row.SelectNodes("th | td");
            if (cells == null)
            {
                continue;
            }

            // Cells beyond the last header have no key to pair with; ignore them
            // instead of indexing past the end of the header collection.
            int pairCount = Math.Min(cells.Count, keys.Count);
            for (int i = 0; i < pairCount; i++)
            {
                pairs.Add($"{keys[i].InnerHtml}: {cells[i].InnerHtml}");
            }
        }

        rObj.Add(pairs);
    }
}

return new OkObjectResult(rObj);
(Note: Removed comments for space in chat) but I need to make this recursive so that it finds nested tables
Pobiega
Pobiega2y ago
your sample html didnt include any nested tables but the idea remains the same in theory
malkav
malkav2y ago
No that's true, but it was a quick example of what I needed to fetch, it is possible with any given URL that there are nested ones 😅 with my current code I do get the results I expected, just some of the results have a string of new <table> elements. So I need now to figure a way to make this recursive in a way..
Pobiega
Pobiega2y ago
Well, making it recursive causes some issues, since now a value can be either a string or a new object (a new table). If you only ever intend to use this as JSON, that's fine (just make the dictionary be <string, object>).
malkav
malkav2y ago
It's an Azure Function that should get a URL as parameter, scrapes the URL for all the HTML code, then takes only <table> elements (all of them) and parses those into a Dictionary<string, string> but I guess <string, object> would work better if there is indeed nested stuff hrm... Right now I am returning a List<string> instead, but changing that into Dictionary is not hard. Lemme see if I can figure this out Oh, right, the reason I used a List<string> is because I keep getting tables with the same column keys 😅 I'm using this URL as example table to extract: https://getbootstrap.com/docs/5.3/content/tables/ but there are plenty of tables in there that use "Heading" as table head "key"
Pobiega
Pobiega2y ago
Right. Well, then your "expected" json is also incorrect
[
{
"Header 1": "1",
"Header 2": "data"
}
]
[
{
"Header 1": "1",
"Header 2": "data"
}
]
this is a list of objects, where each object is actually a collection of key value pairs. json doesnt allow duplicate keys so if those objects are actually just lists of strings, it would be...
[
[
"Header 1: 1",
"Header 2: data"
]
]
[
[
"Header 1: 1",
"Header 2: data"
]
]
which is fine, but that's a whole different story. I changed my code a bit to reflect these changes.
[
[
{
"Header 1": "1",
"Header 2": "data"
},
{
"Header 1": "2",
"Header 2": "data2"
}
]
]
[
[
{
"Header 1": "1",
"Header 2": "data"
},
{
"Header 1": "2",
"Header 2": "data2"
}
]
]
is now my suggested output. The outer list is a list of tables, the inner list is the list of rows in that table, and each object is a row. This would also allow recursive lists, in theory.
malkav
malkav2y ago
I think I came to the same result... Here's a snippet of the result I got now:
[
[
{
"#": "1",
"First": "Mark",
"Last": "Otto",
"Handle": "@mdo"
},
{
"#": "2",
"First": "Jacob",
"Last": "Thornton",
"Handle": "@fat"
},
{
"#": "3",
"First": "Larry the Bird",
"Last": "@twitter"
}
],
[
[
{
"#": "1",
"First": "Mark",
"Last": "Otto",
"Handle": "@mdo"
},
{
"#": "2",
"First": "Jacob",
"Last": "Thornton",
"Handle": "@fat"
},
{
"#": "3",
"First": "Larry the Bird",
"Last": "@twitter"
}
],
Pobiega
Pobiega2y ago
yep
malkav
malkav2y ago
the code I use for this:
/// <summary>
/// Converts one table node into a list of rows, each row mapping the header cell's
/// inner HTML to the matching body cell's inner HTML. A body cell that has a direct
/// child <c>table</c> is converted recursively and stored as a list of row lists.
/// </summary>
/// <param name="node">The table node whose direct <c>thead</c> and <c>tbody</c> children are read.</param>
/// <returns>
/// One dictionary per body row that has cells; an empty list when the table has no
/// header cells or no body rows.
/// </returns>
private static List<Dictionary<string, object>> GenerateTableList(HtmlNode node)
{
    List<Dictionary<string, object>> generatedRows = new();

    // SelectSingleNode / SelectNodes return null when nothing matches, so the chain
    // is null-propagated instead of dereferenced directly.
    // Keys: the th/td cells of the first <tr> inside <thead>.
    HtmlNodeCollection? keys = node.SelectSingleNode("thead")
        ?.SelectNodes("tr")
        ?.FirstOrDefault()
        ?.SelectNodes("th | td");
    // Values: every <tr> that is a direct child of <tbody>.
    HtmlNodeCollection? vRows = node.SelectSingleNode("tbody")?.SelectNodes("tr");

    if (keys == null || vRows == null)
    {
        return generatedRows;
    }

    foreach (HtmlNode row in vRows)
    {
        HtmlNodeCollection? cells = row.SelectNodes("th | td");
        if (cells == null)
        {
            // A <tr> without any th/td cells contributes no data.
            continue;
        }

        Dictionary<string, object> rowData = new();

        // Only pair cells that have a header; extra cells would index past the keys.
        int pairCount = Math.Min(cells.Count, keys.Count);
        for (int i = 0; i < pairCount; i++)
        {
            string key = keys[i].InnerHtml;
            HtmlNode currentCell = cells[i];

            HtmlNodeCollection? tablesInCell = currentCell.SelectNodes("table");
            if (tablesInCell != null && tablesInCell.Any())
            {
                rowData[key] = tablesInCell.Select(GenerateTableList).ToList();
            }
            else
            {
                rowData[key] = currentCell.InnerHtml;
            }
        }

        generatedRows.Add(rowData);
    }

    return generatedRows;
}
/// <summary>
/// Converts one table node into a list of rows, each row mapping the header cell's
/// inner HTML to the matching body cell's inner HTML. A body cell that has a direct
/// child <c>table</c> is converted recursively and stored as a list of row lists.
/// </summary>
/// <param name="node">The table node whose direct <c>thead</c> and <c>tbody</c> children are read.</param>
/// <returns>
/// One dictionary per body row that has cells; an empty list when the table has no
/// header cells or no body rows.
/// </returns>
private static List<Dictionary<string, object>> GenerateTableList(HtmlNode node)
{
    List<Dictionary<string, object>> generatedRows = new();

    // SelectSingleNode / SelectNodes return null when nothing matches, so the chain
    // is null-propagated instead of dereferenced directly.
    // Keys: the th/td cells of the first <tr> inside <thead>.
    HtmlNodeCollection? keys = node.SelectSingleNode("thead")
        ?.SelectNodes("tr")
        ?.FirstOrDefault()
        ?.SelectNodes("th | td");
    // Values: every <tr> that is a direct child of <tbody>.
    HtmlNodeCollection? vRows = node.SelectSingleNode("tbody")?.SelectNodes("tr");

    if (keys == null || vRows == null)
    {
        return generatedRows;
    }

    foreach (HtmlNode row in vRows)
    {
        HtmlNodeCollection? cells = row.SelectNodes("th | td");
        if (cells == null)
        {
            // A <tr> without any th/td cells contributes no data.
            continue;
        }

        Dictionary<string, object> rowData = new();

        // Only pair cells that have a header; extra cells would index past the keys.
        int pairCount = Math.Min(cells.Count, keys.Count);
        for (int i = 0; i < pairCount; i++)
        {
            string key = keys[i].InnerHtml;
            HtmlNode currentCell = cells[i];

            HtmlNodeCollection? tablesInCell = currentCell.SelectNodes("table");
            if (tablesInCell != null && tablesInCell.Any())
            {
                rowData[key] = tablesInCell.Select(GenerateTableList).ToList();
            }
            else
            {
                rowData[key] = currentCell.InnerHtml;
            }
        }

        generatedRows.Add(rowData);
    }

    return generatedRows;
}
I have a feeling it's messy 😅
Pobiega
Pobiega2y ago
Not sure why you swapped from Anglesharp to HAP
malkav
malkav2y ago
Because I got confused at AngleSharp.. Every time I used the QuerySelectorAll("table") method, all my key-value pairs kept being null, or just empty strings. I approached it similarly to this bit of code, but couldn't get any results
Pobiega
Pobiega2y ago
weird. works absolutely fine for me and its faster and more modern, and has a nicer API (imho) 😛
malkav
malkav2y ago
I mean, I would probably agree with you 😅 but I just have no idea why it didn't want to give results, so I switched. Maybe when I start refactoring and using more like this I'll give AngleSharp another go thanks for the help here!
Pobiega
Pobiega2y ago
/// <summary>
/// Parses a single table element into a list of rows keyed by header text.
/// </summary>
/// <param name="table">The table element to parse.</param>
/// <returns>
/// The parsed rows, or <c>null</c> (after writing a message to the console) when the
/// table has no direct <c>thead</c> or <c>tbody</c> child.
/// </returns>
private static List<Dictionary<string, object>>? ParseTable(IElement table)
{
    // Look only at direct children: a descendant query would also match the
    // thead/tbody of a table nested inside one of this table's cells.
    var head = table.Children.FirstOrDefault(child => IsTag(child, "thead"));
    var body = table.Children.FirstOrDefault(child => IsTag(child, "tbody"));

    if (head == null)
    {
        Console.WriteLine("Table without thead found!");
        return null;
    }

    if (body == null)
    {
        Console.WriteLine("Table without tbody found!");
        return null;
    }

    var headRows = head.QuerySelectorAll("th");
    // Same reasoning for rows: body.QuerySelectorAll("tr") would return the rows of
    // nested tables as if they were rows of this table.
    var allBodyRows = body.Children.Where(child => IsTag(child, "tr"));

    var tableResults = GetTableResults(headRows, allBodyRows).ToList();
    return tableResults;

    // Case-insensitive comparison of an element's local name against a tag name.
    static bool IsTag(IElement element, string tagName) =>
        string.Equals(element.LocalName, tagName, StringComparison.OrdinalIgnoreCase);
}

/// <summary>
/// Lazily converts body rows into dictionaries by pairing each row's child cells,
/// in order, with the header cells. Pairing stops at the shorter of the two sequences.
/// </summary>
/// <param name="headRows">Header cells; their text content becomes the keys.</param>
/// <param name="allBodyRows">Body rows; each row's child elements are its cells.</param>
/// <returns>
/// One dictionary per row. A value is the cell's text content, or, when the cell
/// contains tables, a list holding the <c>ParseTable</c> result of each outermost one.
/// </returns>
private static IEnumerable<Dictionary<string, object>> GetTableResults(
    IHtmlCollection<IElement> headRows,
    IEnumerable<IElement> allBodyRows)
{
    foreach (var row in allBodyRows)
    {
        var resultRow = new Dictionary<string, object>();
        foreach (var (cell, headerCell) in row.Children.Zip(headRows))
        {
            var header = headerCell.TextContent;

            // QuerySelectorAll returns every descendant table, including tables nested
            // inside other nested tables. Keep only the outermost ones; deeper levels
            // are reached through the recursive ParseTable call, not parsed twice here.
            var descendantTables = cell.QuerySelectorAll("table");
            var nestedTables = descendantTables
                .Where(candidate => !descendantTables.Any(other =>
                    !ReferenceEquals(other, candidate) && other.Contains(candidate)))
                .ToList();

            // Indexer assignment instead of Add: repeated header text (e.g. several
            // "Heading" columns) would make Add throw ArgumentException. The last
            // column with a given header wins.
            if (nestedTables.Count > 0)
            {
                resultRow[header] = nestedTables.Select(ParseTable).ToList();
            }
            else
            {
                resultRow[header] = cell.TextContent;
            }
        }

        yield return resultRow;
    }
}
/// <summary>
/// Parses a single table element into a list of rows keyed by header text.
/// </summary>
/// <param name="table">The table element to parse.</param>
/// <returns>
/// The parsed rows, or <c>null</c> (after writing a message to the console) when the
/// table has no direct <c>thead</c> or <c>tbody</c> child.
/// </returns>
private static List<Dictionary<string, object>>? ParseTable(IElement table)
{
    // Look only at direct children: a descendant query would also match the
    // thead/tbody of a table nested inside one of this table's cells.
    var head = table.Children.FirstOrDefault(child => IsTag(child, "thead"));
    var body = table.Children.FirstOrDefault(child => IsTag(child, "tbody"));

    if (head == null)
    {
        Console.WriteLine("Table without thead found!");
        return null;
    }

    if (body == null)
    {
        Console.WriteLine("Table without tbody found!");
        return null;
    }

    var headRows = head.QuerySelectorAll("th");
    // Same reasoning for rows: body.QuerySelectorAll("tr") would return the rows of
    // nested tables as if they were rows of this table.
    var allBodyRows = body.Children.Where(child => IsTag(child, "tr"));

    var tableResults = GetTableResults(headRows, allBodyRows).ToList();
    return tableResults;

    // Case-insensitive comparison of an element's local name against a tag name.
    static bool IsTag(IElement element, string tagName) =>
        string.Equals(element.LocalName, tagName, StringComparison.OrdinalIgnoreCase);
}

/// <summary>
/// Lazily converts body rows into dictionaries by pairing each row's child cells,
/// in order, with the header cells. Pairing stops at the shorter of the two sequences.
/// </summary>
/// <param name="headRows">Header cells; their text content becomes the keys.</param>
/// <param name="allBodyRows">Body rows; each row's child elements are its cells.</param>
/// <returns>
/// One dictionary per row. A value is the cell's text content, or, when the cell
/// contains tables, a list holding the <c>ParseTable</c> result of each outermost one.
/// </returns>
private static IEnumerable<Dictionary<string, object>> GetTableResults(
    IHtmlCollection<IElement> headRows,
    IEnumerable<IElement> allBodyRows)
{
    foreach (var row in allBodyRows)
    {
        var resultRow = new Dictionary<string, object>();
        foreach (var (cell, headerCell) in row.Children.Zip(headRows))
        {
            var header = headerCell.TextContent;

            // QuerySelectorAll returns every descendant table, including tables nested
            // inside other nested tables. Keep only the outermost ones; deeper levels
            // are reached through the recursive ParseTable call, not parsed twice here.
            var descendantTables = cell.QuerySelectorAll("table");
            var nestedTables = descendantTables
                .Where(candidate => !descendantTables.Any(other =>
                    !ReferenceEquals(other, candidate) && other.Contains(candidate)))
                .ToList();

            // Indexer assignment instead of Add: repeated header text (e.g. several
            // "Heading" columns) would make Add throw ArgumentException. The last
            // column with a given header wins.
            if (nestedTables.Count > 0)
            {
                resultRow[header] = nestedTables.Select(ParseTable).ToList();
            }
            else
            {
                resultRow[header] = cell.TextContent;
            }
        }

        yield return resultRow;
    }
}
is what I ended up with. It handles recursive tables, but probably needs more error handling for malformed tables, i.e. where the number of headers and columns don't line up, etc.
Accord
Accord2y ago
Was this issue resolved? If so, run /close - otherwise I will mark this as stale and this post will be archived until there is new activity.
Want results from more Discord servers?
Add your server
More Posts