Merge pull request #130 from icewind1991/chapter-number-parsing

Improve chapter number parsing
This commit is contained in:
inorichi 2016-02-09 21:50:32 +01:00
commit 872af276ea
2 changed files with 105 additions and 11 deletions

View File

@ -10,12 +10,17 @@ import eu.kanade.tachiyomi.data.database.models.Manga;
public class ChapterRecognition { public class ChapterRecognition {
private static final Pattern p1 = Pattern.compile("ch[^0-9]?\\s*(\\d+[\\.,]?\\d*)"); private static final Pattern cleanWithToken = Pattern.compile("ch[^0-9]?\\s*(\\d+[\\.,]?\\d+)($|\\b)");
private static final Pattern p2 = Pattern.compile("(\\d+[\\.,]?\\d*)"); private static final Pattern uncleanWithToken = Pattern.compile("ch[^0-9]?\\s*(\\d+[\\.,]?\\d*)");
private static final Pattern p3 = Pattern.compile("(\\d+[\\.,]?\\d*\\s*:)"); private static final Pattern withAlphaPostfix = Pattern.compile("(\\d+[\\.,]?\\d*\\s*)([a-z])($|\\b)");
private static final Pattern cleanNumber = Pattern.compile("(\\d+[\\.,]?\\d+)($|\\b)");
private static final Pattern uncleanNumber = Pattern.compile("(\\d+[\\.,]?\\d*)");
private static final Pattern withColon = Pattern.compile("(\\d+[\\.,]?\\d*\\s*:)");
private static final Pattern pUnwanted = private static final Pattern pUnwanted =
Pattern.compile("\\b(v|ver|vol|version|volume)\\.?\\s*\\d+\\b"); Pattern.compile("(\\b|\\d)(v|ver|vol|version|volume)\\.?\\s*\\d+\\b");
private static final Pattern pPart =
Pattern.compile("(\\b|\\d)part\\s*\\d+.+");
public static void parseChapterNumber(Chapter chapter, Manga manga) { public static void parseChapterNumber(Chapter chapter, Manga manga) {
if (chapter.chapter_number != -1) if (chapter.chapter_number != -1)
@ -24,20 +29,34 @@ public class ChapterRecognition {
String name = chapter.name.toLowerCase(); String name = chapter.name.toLowerCase();
Matcher matcher; Matcher matcher;
// Safest option, the chapter has a token prepended // Safest option, the chapter has a token prepended and nothing at the end of the number
matcher = p1.matcher(name); matcher = cleanWithToken.matcher(name);
if (matcher.find()) {
chapter.chapter_number = Float.parseFloat(matcher.group(1));
return;
}
// a number with a single alpha postfix is parsed as a sub-chapter
matcher = withAlphaPostfix.matcher(name);
if (matcher.find()) {
chapter.chapter_number = Float.parseFloat(matcher.group(1)) + parseAlphaPostFix(matcher.group(2));
return;
}
// the chapter has a token prepended and something at the end of the number
matcher = uncleanWithToken.matcher(name);
if (matcher.find()) { if (matcher.find()) {
chapter.chapter_number = Float.parseFloat(matcher.group(1)); chapter.chapter_number = Float.parseFloat(matcher.group(1));
return; return;
} }
// Remove anything related to the volume or version // Remove anything related to the volume or version
name = pUnwanted.matcher(name).replaceAll(""); name = pUnwanted.matcher(name).replaceAll("$1");
List<Float> occurrences; List<Float> occurrences;
// If there's only one number, use it // If there's only one number, use it
matcher = p2.matcher(name); matcher = uncleanNumber.matcher(name);
occurrences = getAllOccurrences(matcher); occurrences = getAllOccurrences(matcher);
if (occurrences.size() == 1) { if (occurrences.size() == 1) {
chapter.chapter_number = occurrences.get(0); chapter.chapter_number = occurrences.get(0);
@ -45,7 +64,15 @@ public class ChapterRecognition {
} }
// If it has a colon, the chapter number should be that one // If it has a colon, the chapter number should be that one
matcher = p3.matcher(name); matcher = withColon.matcher(name);
occurrences = getAllOccurrences(matcher);
if (occurrences.size() == 1) {
chapter.chapter_number = occurrences.get(0);
return;
}
// Prefer numbers without anything appended
matcher = cleanNumber.matcher(name);
occurrences = getAllOccurrences(matcher); occurrences = getAllOccurrences(matcher);
if (occurrences.size() == 1) { if (occurrences.size() == 1) {
chapter.chapter_number = occurrences.get(0); chapter.chapter_number = occurrences.get(0);
@ -59,7 +86,7 @@ public class ChapterRecognition {
String mangaName = replaceIrrelevantCharacters(manga.title); String mangaName = replaceIrrelevantCharacters(manga.title);
String nameWithoutManga = difference(mangaName, name); String nameWithoutManga = difference(mangaName, name);
if (!nameWithoutManga.isEmpty()) { if (!nameWithoutManga.isEmpty()) {
matcher = p2.matcher(nameWithoutManga); matcher = uncleanNumber.matcher(nameWithoutManga);
occurrences = getAllOccurrences(matcher); occurrences = getAllOccurrences(matcher);
if (occurrences.size() == 1) { if (occurrences.size() == 1) {
chapter.chapter_number = occurrences.get(0); chapter.chapter_number = occurrences.get(0);
@ -69,6 +96,36 @@ public class ChapterRecognition {
// TODO more checks (maybe levenshtein?) // TODO more checks (maybe levenshtein?)
// try splitting the name into parts and pick the first valid one
String[] nameParts = chapter.name.split("-");
Chapter dummyChapter = Chapter.create();
if (nameParts.length > 1) {
for (String part : nameParts) {
dummyChapter.name = part;
parseChapterNumber(dummyChapter, manga);
if (dummyChapter.chapter_number >= 0) {
chapter.chapter_number = dummyChapter.chapter_number;
return;
}
}
}
// Strip anything after "part xxx" and try that
name = pPart.matcher(name).replaceAll("$1");
dummyChapter.name = name;
parseChapterNumber(dummyChapter, manga);
if (dummyChapter.chapter_number >= 0) {
chapter.chapter_number = dummyChapter.chapter_number;
return;
}
}
/**
* x.a -> x.1, x.b -> x.2, etc
*/
private static float parseAlphaPostFix(String postfix) {
char alpha = postfix.charAt(0);
return Float.parseFloat("0." + Integer.toString((int)alpha - 96));
} }
public static List<Float> getAllOccurrences(Matcher matcher) { public static List<Float> getAllOccurrences(Matcher matcher) {
@ -76,7 +133,7 @@ public class ChapterRecognition {
while (matcher.find()) { while (matcher.find()) {
// Match again to get only numbers from the captured text // Match again to get only numbers from the captured text
String text = matcher.group(); String text = matcher.group();
Matcher m = p2.matcher(text); Matcher m = uncleanNumber.matcher(text);
if (m.find()) { if (m.find()) {
try { try {
Float value = Float.parseFloat(m.group(1)); Float value = Float.parseFloat(m.group(1));

View File

@ -135,4 +135,41 @@ public class ChapterRecognitionTest {
assertThat(c.chapter_number).isEqualTo(28f); assertThat(c.chapter_number).isEqualTo(28f);
} }
/**
 * A volume marker glued directly onto the chapter number ("011v002") must not
 * influence the result: the recognised chapter is 11.
 */
@Test
public void testWithVolumeAttachedToChapter() {
    final String title = "Ansatsu Kyoushitsu 011v002: Assembly Time";
    final Chapter chapter = createChapter(title);

    ChapterRecognition.parseChapterNumber(chapter, randomManga);

    assertThat(chapter.chapter_number).isEqualTo(11f);
}
/**
 * A second number that appears inside the chapter's own title ("2nd Hour")
 * must not win over the leading chapter number: the recognised chapter is 99.
 */
@Test
public void testWithNumberInChapterTitle() {
    final String title = "Ansatsu Kyoushitsu 099 Present Time - 2nd Hour";
    final Chapter chapter = createChapter(title);

    ChapterRecognition.parseChapterNumber(chapter, randomManga);

    assertThat(chapter.chapter_number).isEqualTo(99f);
}
/**
 * A single letter appended to the chapter number denotes a sub-chapter:
 * "19a" is recognised as 19.1 and "19b" as 19.2.
 */
@Test
public void testAlphaSubChapters() {
    // Suffix 'a' -> first sub-chapter.
    final Chapter partA = createChapter("Asu No Yoichi 19a");
    ChapterRecognition.parseChapterNumber(partA, randomManga);
    assertThat(partA.chapter_number).isEqualTo(19.1f);

    // Suffix 'b' -> second sub-chapter.
    final Chapter partB = createChapter("Asu No Yoichi 19b");
    ChapterRecognition.parseChapterNumber(partB, randomManga);
    assertThat(partB.chapter_number).isEqualTo(19.2f);
}
/**
 * When the name also carries a volume number and an arc-local number after a dash,
 * the number in front of the dash is the chapter: the recognised chapter is 123.
 */
@Test
public void testChapterWithArcNumber() {
    final String title = "Manga title 123 - Vol 016 Arc title 002";
    final Chapter chapter = createChapter(title);

    ChapterRecognition.parseChapterNumber(chapter, randomManga);

    assertThat(chapter.chapter_number).isEqualTo(123f);
}
/**
 * A "Chapter 001" token that follows a "Part 002" section must not be mistaken for
 * the chapter number: the recognised chapter is the leading 27.
 */
@Test
public void testChapterWithChapterPrefixAfterPart() {
    final String title = "Tokyo ESP 027: Part 002: Chapter 001";
    final Chapter chapter = createChapter(title);

    ChapterRecognition.parseChapterNumber(chapter, randomManga);

    // Written as 27f: a float literal is always decimal, so a leading zero adds nothing
    // and only makes the value look like an octal constant.
    assertThat(chapter.chapter_number).isEqualTo(27f);
}
} }