Bepis
Bepis
CC#
Created by Bepis on 10/19/2024 in #help
Is there a faster way of splitting strings?
I feel like I've hit a local minimum with my code. Basically, it splits strings on certain characters and lowercases the result, e.g. "This fair child of mine\nShall sum my count, and make my old excuse" becomes [this, fair, child, of, mine, shall, sum, my, count, and, make, my, old, excuse]. It processes 100MB of text content in 2.33 seconds, but I feel like it should be able to go faster. - It's too much of a hassle to use Memory<char> elsewhere in the code instead of returning strings. - I've tried string pooling instead of constantly allocating new strings, and it actually ends up being slower somehow. Code:
/// <summary>
/// Copies <paramref name="span"/> into a new string, converting the ASCII letters
/// 'A' through 'Z' to lowercase. Every other character (including non-ASCII
/// uppercase letters) is copied unchanged.
/// </summary>
/// <param name="span">The characters to convert; may be empty.</param>
/// <returns>A new string with the same length as <paramref name="span"/>.</returns>
static string MemoryToLower(ReadOnlySpan<char> span)
{
    // Only small inputs use the stack. An unbounded stackalloc sized by the input
    // can overflow the stack on a long token, which terminates the process.
    const int MaxStackChars = 256;

    Span<char> buffer = span.Length <= MaxStackChars
        ? stackalloc char[MaxStackChars]
        : new char[span.Length];
    Span<char> chars = buffer.Slice(0, span.Length);

    for (int i = 0; i < span.Length; i++)
    {
        char c = span[i];

        // 'a' - 'A' == 32, so adding 32 maps an ASCII uppercase letter to lowercase.
        chars[i] = (c >= 'A' && c <= 'Z') ? (char)(c + 32) : c;
    }

    return new string(chars);
}

// Separator characters that Tokenizer splits on. Declared readonly because the
// set is built once and must never be reassigned.
static readonly SearchValues<char> searchValues = SearchValues.Create(new[] { ' ', '\r', '\n', ',', '.', '?', '"', '\'', ';', '!', '\t', '(', ')', '[', ']', '<', '>', '+', '-', '*' });

/// <summary>
/// Splits <paramref name="input"/> on the characters in <c>searchValues</c> and
/// returns the non-empty pieces, each passed through <c>MemoryToLower</c>.
/// </summary>
/// <param name="input">The text to tokenize.</param>
/// <returns>The tokens in input order; an empty list when there are none.</returns>
/// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
static List<string> Tokenizer(string input)
{
    ArgumentNullException.ThrowIfNull(input);

    var tokens = new List<string>();
    ReadOnlySpan<char> remaining = input.AsSpan();

    while (!remaining.IsEmpty)
    {
        // Let the span search locate the next separator instead of testing
        // one character at a time with SearchValues.Contains.
        int separatorIndex = remaining.IndexOfAny(searchValues);

        if (separatorIndex < 0)
        {
            // No separator left: the rest of the input is the final token.
            tokens.Add(MemoryToLower(remaining));
            break;
        }

        // Adjacent separators produce zero-length pieces, which are skipped.
        if (separatorIndex > 0)
        {
            tokens.Add(MemoryToLower(remaining.Slice(0, separatorIndex)));
        }

        remaining = remaining.Slice(separatorIndex + 1);
    }

    return tokens;
}
/// <summary>
/// Copies <paramref name="span"/> into a new string, converting the ASCII letters
/// 'A' through 'Z' to lowercase. Every other character (including non-ASCII
/// uppercase letters) is copied unchanged.
/// </summary>
/// <param name="span">The characters to convert; may be empty.</param>
/// <returns>A new string with the same length as <paramref name="span"/>.</returns>
static string MemoryToLower(ReadOnlySpan<char> span)
{
    // Only small inputs use the stack. An unbounded stackalloc sized by the input
    // can overflow the stack on a long token, which terminates the process.
    const int MaxStackChars = 256;

    Span<char> buffer = span.Length <= MaxStackChars
        ? stackalloc char[MaxStackChars]
        : new char[span.Length];
    Span<char> chars = buffer.Slice(0, span.Length);

    for (int i = 0; i < span.Length; i++)
    {
        char c = span[i];

        // 'a' - 'A' == 32, so adding 32 maps an ASCII uppercase letter to lowercase.
        chars[i] = (c >= 'A' && c <= 'Z') ? (char)(c + 32) : c;
    }

    return new string(chars);
}

// Separator characters that Tokenizer splits on. Declared readonly because the
// set is built once and must never be reassigned.
static readonly SearchValues<char> searchValues = SearchValues.Create(new[] { ' ', '\r', '\n', ',', '.', '?', '"', '\'', ';', '!', '\t', '(', ')', '[', ']', '<', '>', '+', '-', '*' });

/// <summary>
/// Splits <paramref name="input"/> on the characters in <c>searchValues</c> and
/// returns the non-empty pieces, each passed through <c>MemoryToLower</c>.
/// </summary>
/// <param name="input">The text to tokenize.</param>
/// <returns>The tokens in input order; an empty list when there are none.</returns>
/// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
static List<string> Tokenizer(string input)
{
    ArgumentNullException.ThrowIfNull(input);

    var tokens = new List<string>();
    ReadOnlySpan<char> remaining = input.AsSpan();

    while (!remaining.IsEmpty)
    {
        // Let the span search locate the next separator instead of testing
        // one character at a time with SearchValues.Contains.
        int separatorIndex = remaining.IndexOfAny(searchValues);

        if (separatorIndex < 0)
        {
            // No separator left: the rest of the input is the final token.
            tokens.Add(MemoryToLower(remaining));
            break;
        }

        // Adjacent separators produce zero-length pieces, which are skipped.
        if (separatorIndex > 0)
        {
            tokens.Add(MemoryToLower(remaining.Slice(0, separatorIndex)));
        }

        remaining = remaining.Slice(separatorIndex + 1);
    }

    return tokens;
}
30 replies