diff src/core/strings.cc @ 264:9a04802848c0

*: improve multiple things e.g. making some strings.cc functions modify strings in-place, improving m4_ax_have_qt.m4 code, making anime_db.cc rely on std::optional rather than std::shared_ptr (which was stupid anyway)
author Paper <paper@paper.us.eu.org>
date Thu, 11 Apr 2024 10:15:57 -0400
parents dd211ff68b36
children f31305b9f60a
line wrap: on
line diff
--- a/src/core/strings.cc	Wed Apr 03 20:46:40 2024 -0400
+++ b/src/core/strings.cc	Thu Apr 11 10:15:57 2024 -0400
@@ -18,6 +18,8 @@
 #include <unordered_map>
 #include <vector>
 
+#include "utf8proc.h"
+
 namespace Strings {
 
 /* ew */
@@ -98,6 +100,38 @@
 		ReplaceAll(string, item.second, item.first);
 }
 
+/* Normalizes a UTF-8 string in place using utf8proc.
+ *
+ * The requested options perform compatibility decomposition followed by
+ * composition, strip ignorable characters, control characters and
+ * combining marks, lump similar characters together, convert newline
+ * sequences to a line separator, and apply case folding -- so the
+ * string is effectively lowercase afterwards.
+ *
+ * If utf8proc reports an error (for example on invalid UTF-8), the
+ * string is left untouched. A valid empty result clears the string.
+ */
+void NormalizeUnicode(std::string& string) {
+	static constexpr utf8proc_option_t options = static_cast<utf8proc_option_t>(
+		UTF8PROC_COMPAT | UTF8PROC_COMPOSE | UTF8PROC_STABLE |
+		UTF8PROC_IGNORE | UTF8PROC_STRIPCC | UTF8PROC_STRIPMARK |
+		UTF8PROC_LUMP | UTF8PROC_CASEFOLD | UTF8PROC_NLF2LS
+	);
+
+	/* utf8proc_map hands back a heap buffer through this pointer;
+	 * it is released with free() below */
+	utf8proc_uint8_t* buf = nullptr;
+
+	const utf8proc_ssize_t size = utf8proc_map(
+		reinterpret_cast<const utf8proc_uint8_t*>(string.data()),
+		static_cast<utf8proc_ssize_t>(string.size()),
+		&buf,
+		options
+	);
+
+	/* a negative return value is an error code, not a length; only
+	 * a non-negative size together with a valid buffer is a result */
+	if (size >= 0 && buf)
+		string.assign(reinterpret_cast<const char*>(buf), static_cast<std::string::size_type>(size));
+
+	/* free(nullptr) is a no-op, so no null check is needed */
+	free(buf);
+}
+
+/* Normalizes an anime title in place.
+ *
+ * Runs, in order: ConvertRomanNumerals, NormalizeUnicode (which also
+ * case-folds the string), then strips leading and trailing ' '
+ * characters.
+ */
+void NormalizeAnimeTitle(std::string& string) {
+	/* NOTE(review): presumably this has to run before NormalizeUnicode
+	 * touches the numeral characters -- confirm before reordering */
+	ConvertRomanNumerals(string);
+	NormalizeUnicode(string);
+	RemoveLeadingChars(string, ' ');
+	RemoveTrailingChars(string, ' ');
+}
+
 /* removes dumb HTML tags because anilist is aids and
  * gives us HTML for synopses :/
  */
@@ -230,14 +264,12 @@
 	return ToInt(str, 0);
 }
 
-std::string RemoveLeadingChars(std::string s, const char c) {
+/* Removes every leading occurrence of `c` from `s`, in place.
+ *
+ * A string that consists only of `c` (or is empty) ends up empty.
+ */
+void RemoveLeadingChars(std::string& s, const char c) {
-	s.erase(0, std::min(s.find_first_not_of(c), s.size() - 1));
+	/* find_first_not_of returns npos when every character equals `c`;
+	 * erase(0, npos) then clears the whole string, which is what we
+	 * want. Clamping the count to size() - 1 left one character behind. */
+	s.erase(0, s.find_first_not_of(c));
-	return s;
 }
 
-std::string RemoveTrailingChars(std::string s, const char c) {
+/* Removes every trailing occurrence of `c` from `s`, in place.
+ *
+ * A string that consists only of `c` (or is empty) ends up empty.
+ */
+void RemoveTrailingChars(std::string& s, const char c) {
-	s.erase(s.find_last_not_of(c) + 1, std::string::npos);
+	const std::string::size_type last_kept = s.find_last_not_of(c);
+
+	if (last_kept == std::string::npos) {
+		/* nothing but `c` (or nothing at all): drop everything */
+		s.clear();
+	} else {
+		/* cut off everything after the last character we keep */
+		s.erase(last_kept + 1);
+	}
-	return s;
 }
 
 bool BeginningMatchesSubstring(const std::string& str, const std::string& sub) {