Rewrote ChapterRecognition to Kotlin. (#293)

This commit is contained in:
Bram van de Kerkhof
2016-05-11 15:33:14 +02:00
committed by inorichi
parent c64bd81339
commit a6df745daa
4 changed files with 561 additions and 403 deletions

View File

@ -1,205 +0,0 @@
package eu.kanade.tachiyomi.util;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import eu.kanade.tachiyomi.data.database.models.Chapter;
import eu.kanade.tachiyomi.data.database.models.Manga;
public final class ChapterRecognition {
private static final Pattern cleanWithToken = Pattern.compile("ch[^0-9]?\\s*(\\d+[\\.,]?\\d+)($|\\b)");
private static final Pattern uncleanWithToken = Pattern.compile("ch[^0-9]?\\s*(\\d+[\\.,]?\\d*)");
private static final Pattern withAlphaPostfix = Pattern.compile("(\\d+[\\.,]?\\d*\\s*)([a-z])($|\\b)");
private static final Pattern cleanNumber = Pattern.compile("(\\d+[\\.,]?\\d+)($|\\b)");
private static final Pattern uncleanNumber = Pattern.compile("(\\d+[\\.,]?\\d*)");
private static final Pattern withColon = Pattern.compile("(\\d+[\\.,]?\\d*\\s*:)([^\\d]|$)");
private static final Pattern startingNumber = Pattern.compile("^(\\d+[\\.,]?\\d*)");
private static final Pattern pUnwanted =
Pattern.compile("(\\b|\\d)(v|ver|vol|version|volume)\\.?\\s*\\d+\\b");
private static final Pattern pPart =
Pattern.compile("(\\b|\\d)part\\s*\\d+.+");
private ChapterRecognition() throws InstantiationException {
throw new InstantiationException("This class is not for instantiation");
}
public static void parseChapterNumber(Chapter chapter, Manga manga) {
if (chapter.chapter_number != -1)
return;
String name = chapter.name.toLowerCase();
Matcher matcher;
// Safest option, the chapter has a token prepended and nothing at the end of the number
matcher = cleanWithToken.matcher(name);
if (matcher.find()) {
chapter.chapter_number = Float.parseFloat(matcher.group(1));
return;
}
// a number with a single alpha prefix is parsed as sub-chapter
matcher = withAlphaPostfix.matcher(name);
if (matcher.find()) {
chapter.chapter_number = Float.parseFloat(matcher.group(1)) + parseAlphaPostFix(matcher.group(2));
return;
}
// the chapter has a token prepended and something at the end of the number
matcher = uncleanWithToken.matcher(name);
if (matcher.find()) {
chapter.chapter_number = Float.parseFloat(matcher.group(1));
return;
}
// Remove anything related to the volume or version
name = pUnwanted.matcher(name).replaceAll("$1");
List<Float> occurrences;
// If there's only one number, use it
matcher = uncleanNumber.matcher(name);
occurrences = getAllOccurrences(matcher);
if (occurrences.size() == 1) {
chapter.chapter_number = occurrences.get(0);
return;
}
// If it has a colon, the chapter number should be that one
matcher = withColon.matcher(name);
occurrences = getAllOccurrences(matcher);
if (occurrences.size() == 1) {
chapter.chapter_number = occurrences.get(0);
return;
}
// Prefer numbers without anything appended
matcher = cleanNumber.matcher(name);
occurrences = getAllOccurrences(matcher);
if (occurrences.size() == 1) {
chapter.chapter_number = occurrences.get(0);
return;
}
// This can lead to issues if two numbers are separated by an space
name = name.replaceAll("\\s+", "");
// Try to remove the manga name from the chapter, and try again
String mangaName = replaceIrrelevantCharacters(manga.title);
String nameWithoutManga = difference(mangaName, name).trim();
if (!nameWithoutManga.isEmpty()) {
matcher = uncleanNumber.matcher(nameWithoutManga);
occurrences = getAllOccurrences(matcher);
if (occurrences.size() == 1) {
chapter.chapter_number = occurrences.get(0);
return;
}
}
// TODO more checks (maybe levenshtein?)
// try splitting the name in parts an pick the first valid one
String[] nameParts = chapter.name.split("-");
Chapter dummyChapter = Chapter.create();
if (nameParts.length > 1) {
for (String part : nameParts) {
dummyChapter.name = part;
parseChapterNumber(dummyChapter, manga);
if (dummyChapter.chapter_number >= 0) {
chapter.chapter_number = dummyChapter.chapter_number;
return;
}
}
}
// Strip anything after "part xxx" and try that
matcher = pPart.matcher(name);
if (matcher.find()) {
name = pPart.matcher(name).replaceAll("$1");
dummyChapter.name = name;
parseChapterNumber(dummyChapter, manga);
if (dummyChapter.chapter_number >= 0) {
chapter.chapter_number = dummyChapter.chapter_number;
return;
}
}
// check for a number either at the start or right after the manga title
matcher = startingNumber.matcher(name);
if (matcher.find()) {
chapter.chapter_number = Float.parseFloat(matcher.group(1));
return;
}
matcher = startingNumber.matcher(nameWithoutManga);
if (matcher.find()) {
chapter.chapter_number = Float.parseFloat(matcher.group(1));
return;
}
}
/**
* x.a -> x.1, x.b -> x.2, etc
*/
private static float parseAlphaPostFix(String postfix) {
char alpha = postfix.charAt(0);
return Float.parseFloat("0." + Integer.toString((int)alpha - 96));
}
public static List<Float> getAllOccurrences(Matcher matcher) {
List<Float> occurences = new ArrayList<>();
while (matcher.find()) {
// Match again to get only numbers from the captured text
String text = matcher.group();
Matcher m = uncleanNumber.matcher(text);
if (m.find()) {
try {
Float value = Float.parseFloat(m.group(1).replaceAll(",", "."));
if (!occurences.contains(value)) {
occurences.add(value);
}
} catch (NumberFormatException e) { /* Do nothing */ }
}
}
return occurences;
}
public static String replaceIrrelevantCharacters(String str) {
return str.replaceAll("\\s+", "").toLowerCase();
}
public static String difference(String str1, String str2) {
if (str1 == null) {
return str2;
}
if (str2 == null) {
return str1;
}
int at = indexOfDifference(str1, str2);
if (at == -1) {
return "";
}
return str2.substring(at);
}
public static int indexOfDifference(String str1, String str2) {
if (str1 == str2) {
return -1;
}
if (str1 == null || str2 == null) {
return 0;
}
int i;
for (i = 0; i < str1.length() && i < str2.length(); ++i) {
if (str1.charAt(i) != str2.charAt(i)) {
break;
}
}
if (i < str2.length() || i < str1.length()) {
return i;
}
return -1;
}
}

View File

@ -0,0 +1,140 @@
package eu.kanade.tachiyomi.util
import eu.kanade.tachiyomi.data.database.models.Chapter
import eu.kanade.tachiyomi.data.database.models.Manga
/**
* -R> = regex conversion.
*/
object ChapterRecognition {
/**
* All cases with Ch.xx
* Mokushiroku Alice Vol.1 Ch. 4: Misrepresentation -R> 4
*/
private val basic = Regex("""(?<=ch\.)([0-9]+)(\.[0-9]+)?(\.?[a-z]+)?""")
/**
* Regex used when only one number occurrence
* Example: Bleach 567: Down With Snowwhite -R> 567
*/
private val occurrence = Regex("""([0-9]+)(\.[0-9]+)?(\.?[a-z]+)?""")
/**
* Regex used when manga title removed
* Example: Solanin 028 Vol. 2 -> 028 Vol.2 -> 028Vol.2 -R> 028
*/
private val withoutMange = Regex("""^([0-9]+)(\.[0-9]+)?(\.?[a-z]+)?""")
/**
* Regex used to remove unwanted tags
* Example Prison School 12 v.1 vol004 version1243 volume64 -R> Prison School 12
*/
private val unwanted = Regex("""(?:(v|ver|vol|version|volume).?[0-9]+)""")
private val unwantedWhiteSpace = Regex("""(\s)(extra|special|omake)""")
fun parseChapterNumber(chapter: Chapter, manga: Manga) {
//If chapter number is known return.
if (chapter.chapter_number != -1f)
return
// Get chapter title with lower case
var name = chapter.name.toLowerCase()
// Remove comma's from chapter.
name = name.replace(',', '.')
// Remove unwanted white spaces.
unwantedWhiteSpace.findAll(name).let {
it.forEach { occurrence -> name = name.replace(occurrence.value, occurrence.value.trim()) }
}
// Remove unwanted tags.
unwanted.findAll(name).let {
it.forEach { occurrence -> name = name.replace(occurrence.value, "") }
}
// Check base case ch.xx
if (updateChapter(basic.find(name), chapter))
return
// Check one number occurrence.
val occurrences: MutableList<MatchResult> = arrayListOf()
occurrence.findAll(name).let {
it.forEach { occurrence -> occurrences.add(occurrence) }
}
if (occurrences.size == 1) {
if (updateChapter(occurrences[0], chapter))
return
}
// Remove manga title from chapter title.
val nameWithoutManga = name.replace(manga.title.toLowerCase(), "").trim()
// Check if first value is number after title remove.
if (updateChapter(withoutMange.find(nameWithoutManga), chapter))
return
// Take the first number encountered.
if (updateChapter(occurrence.find(nameWithoutManga), chapter))
return
}
/**
* Check if volume is found and update chapter
* @param match result of regex
* @param chapter chapter object
* @return true if volume is found
*/
fun updateChapter(match: MatchResult?, chapter: Chapter): Boolean {
match?.let {
val initial = it.groups[1]?.value?.toFloat()!!
val subChapterDecimal = it.groups[2]?.value
val subChapterAlpha = it.groups[3]?.value
val addition = checkForDecimal(subChapterDecimal, subChapterAlpha)
chapter.chapter_number = initial.plus(addition)
return true
}
return false
}
/**
* Check for decimal in received strings
* @param decimal decimal value of regex
* @param alpha alpha value of regex
* @return decimal/alpha float value
*/
fun checkForDecimal(decimal: String?, alpha: String?): Float {
if (!decimal.isNullOrEmpty())
return decimal?.toFloat()!!
if (!alpha.isNullOrEmpty()) {
if (alpha!!.contains("extra"))
return .99f
if (alpha.contains("omake"))
return .98f
if (alpha.contains("special"))
return .97f
if (alpha[0].equals('.') ) {
// Take value after (.)
return parseAlphaPostFix(alpha[1])
} else {
return parseAlphaPostFix(alpha[0])
}
}
return .0f
}
/**
* x.a -> x.1, x.b -> x.2, etc
*/
private fun parseAlphaPostFix(alpha: Char): Float {
return ("0." + Integer.toString(alpha.toInt() - 96)).toFloat()
}
}