package moe.odango.index.scraper.mal

import moe.odango.index.entity.AnimeRelation
import moe.odango.index.entity.MyAnimeListInfo
import moe.odango.index.utils.IntDate
import moe.odango.index.utils.ProducerFunction
import moe.odango.index.utils.brText
import org.jsoup.Jsoup
import org.jsoup.nodes.Element
import java.time.Duration
import java.util.*

/**
 * Extracts structured information from the HTML of an anime detail page
 * (presumably a MyAnimeList page, judging by the package and entity names).
 *
 * On construction the document is parsed and every "Label: value" row (an element that
 * directly contains a `.dark_text` label) is indexed by its lower-cased label. The
 * individual getters then read from that index or query the DOM directly.
 *
 * @param body raw HTML of the page to scrape.
 */
class AnimePageScraper(body: String) {
    private val dom = Jsoup.parse(body)

    /** Lower-cased row label -> (trimmed text after the first colon, the row element itself). */
    private val items: MutableMap<String, Pair<String, Element>> = mutableMapOf()

    init {
        // Index only the *direct* parent of each label. Walking all ancestors (as
        // `Elements.parents()` does) lets a wrapper element whose text happens to start
        // with the same label overwrite the correct row with the text of the whole wrapper.
        dom.select(".dark_text")
            .mapNotNull { label -> label.parent() }
            .distinct()
            .forEach { row ->
                val parts = row.text().split(":", limit = 2)
                // Locale.ROOT keeps the keys stable regardless of the JVM default locale.
                val key = parts.first().trim().toLowerCase(Locale.ROOT)
                // When the row text has no colon, `last()` is the whole text (same as the key source).
                val value = parts.last().trim()
                items[key] = value to row
            }
    }

    /** Aggregate of everything this scraper can extract from a page. */
    data class Info(
        val title: String,
        val englishName: String?,
        val japaneseName: String?,
        val synonyms: List<String>,
        val description: String,
        val type: MyAnimeListInfo.ReleaseType,
        val episodes: Int?,
        val source: String?,
        val image: String?,
        val genres: List<Genre>,
        val aired: Aired?,
        val premiered: Premiered?,
        val rating: MyAnimeListInfo.Rating?,
        val duration: Duration?,
        val related: List<Relation>,
        val producers: List<ProducerRelation>
    )

    /**
     * Runs every getter and bundles the results.
     *
     * @throws RuntimeException whatever [getTitle] or [getReleaseType] throw when the page
     * lacks a title or a recognizable release type.
     */
    fun getInfo() = Info(
        title = getTitle(),
        englishName = getEnglishName(),
        japaneseName = getJapaneseName(),
        synonyms = getSynonyms(),
        description = getDescription(),
        type = getReleaseType(),
        episodes = getEpisodes(),
        source = getSource(),
        image = getImage(),
        genres = getGenres(),
        aired = getAired(),
        premiered = getPremiered(),
        rating = getRating(),
        duration = getDuration(),
        related = getRelated(),
        producers = getProducers()
    )

    /** @return the "episodes" row parsed as an integer, or `null` when absent or not numeric. */
    fun getEpisodes(): Int? {
        return items["episodes"]?.first?.toIntOrNull()
    }

    /**
     * @return the release type named by the "type" row.
     * @throws NullPointerException when the page has no "type" row.
     * @throws IllegalArgumentException when the row text is not a [MyAnimeListInfo.ReleaseType] constant name.
     */
    fun getReleaseType(): MyAnimeListInfo.ReleaseType {
        // `!!` is kept on purpose: the return type is non-null and callers already see an
        // NPE for pages without a type row; changing the exception type would alter the contract.
        val typeName = items["type"]!!.first
        return MyAnimeListInfo.ReleaseType.valueOf(typeName)
    }

    /**
     * @return the first line of the page heading (`.h1-title span[itemprop=name]`).
     * Fails if the page has no such element.
     */
    fun getTitle(): String {
        return dom.select(".h1-title span[itemprop=name]").first().brText().split("\n").first()
    }

    /**
     * @return the comma-separated entries of the "synonyms" row, trimmed and with blank
     * entries removed; an empty list when the row is absent or when the English name or
     * the title contains a comma.
     */
    fun getSynonyms(): List<String> {
        // If the English name or the title contains a comma we can't parse the synonyms,
        // since they are split by commas.
        val ambiguous = getEnglishName()?.contains(",") == true || getTitle().contains(",")
        if (ambiguous) return emptyList()

        return items["synonyms"]?.first
            ?.split(",")
            ?.map { it.trim() }
            ?.filter { it.isNotEmpty() }
            ?: emptyList()
    }

    /** @return the text of the "japanese" row, or `null` when absent. */
    fun getJapaneseName(): String? {
        return items["japanese"]?.first
    }

    /** @return the text of the "english" row, or `null` when absent. */
    fun getEnglishName(): String? {
        return items["english"]?.first
    }

    /** @return the text of the "source" row, or `null` when absent. */
    fun getSource(): String? {
        return items["source"]?.first
    }

    /**
     * @return the URL of the `img[itemprop=image]` element, taken from `data-src` and
     * falling back to `src`; `null` when there is no such element or neither attribute is set.
     */
    fun getImage(): String? {
        val img = dom.selectFirst("img[itemprop=image]") ?: return null
        // jsoup returns "" for a missing attribute, so fall back explicitly.
        val lazySrc = img.attr("data-src")
        val url = if (lazySrc.isNotEmpty()) lazySrc else img.attr("src")
        return url.takeIf { it.isNotEmpty() }
    }

    /** @return the text of the `[itemprop=description]` element, or `""` when absent. */
    fun getDescription(): String {
        return dom.selectFirst("[itemprop=description]")?.brText() ?: ""
    }

    /**
     * Parses the "aired" row, formatted as `<start>` or `<start> to <end>` with dates in
     * `MMM d, yyyy` (US locale).
     *
     * @return `null` when the row is absent or the start date does not parse; an
     * unparseable or missing end date yields [Aired.end] `== null`.
     */
    fun getAired(): Aired? {
        val text = items["aired"]?.first ?: return null
        val parts = text.split(" to ", limit = 2)

        val start = IntDate.parse(AIRED_DATE_PATTERN, parts.first(), Locale.US) ?: return null
        val end = parts.getOrNull(1)?.let { IntDate.parse(AIRED_DATE_PATTERN, it, Locale.US) }
        return Aired(start, end)
    }

    /** Airing period; [end] is `null` when no (parseable) end date was present. */
    data class Aired(val start: IntDate, val end: IntDate? = null)

    /**
     * Parses the "premiered" row, formatted as `<season> <year>`.
     *
     * @return `null` when the row is absent, does not consist of exactly two
     * space-separated parts, names an unknown season, or has a non-numeric year.
     */
    fun getPremiered(): Premiered? {
        val parts = items["premiered"]?.first?.split(" ", limit = 2) ?: return null
        if (parts.size != 2) return null

        val season = Premiered.Season.fromString(parts[0]) ?: return null
        val year = parts[1].toIntOrNull() ?: return null
        return Premiered(season, year)
    }

    /** Season and year in which the anime premiered. */
    data class Premiered(val season: Season, val year: Int) {
        enum class Season {
            Spring, Summer, Fall, Winter;

            companion object {
                /** Case-insensitive lookup by English season name; `null` for anything else. */
                fun fromString(season: String): Season? {
                    return when (season.toLowerCase(Locale.ROOT)) {
                        "spring" -> Spring
                        "summer" -> Summer
                        "fall" -> Fall
                        "winter" -> Winter
                        else -> null
                    }
                }
            }
        }
    }

    /**
     * @return one [Genre] per link in the "genres" row whose URL has a numeric path
     * segment right after `genre`; links without one are skipped. Empty when the row is absent.
     */
    fun getGenres(): List<Genre> {
        val row = items["genres"]?.second ?: return emptyList()
        return row.select("a").mapNotNull { link ->
            segmentAfter(link.attr("href"), "genre")
                ?.toIntOrNull()
                ?.let { id -> Genre(id, link.text().trim()) }
        }
    }

    /** A genre identified by the numeric id found in its URL and its display name. */
    data class Genre(val id: Int, val name: String) {
        companion object {
            val COMEDY = Genre(4, "Comedy")
            val DEMONS = Genre(6, "Demons")
            val DRAMA = Genre(8, "Drama")
            val FANTASY = Genre(10, "Fantasy")
            val HENTAI = Genre(12, "Hentai")
            val MAGIC = Genre(16, "Magic")
            val PARODY = Genre(20, "Parody")
            val ROMANCE = Genre(22, "Romance")
            val SCHOOL = Genre(23, "School")
            val SCIFI = Genre(24, "Sci-Fi")
            val SHOUJO = Genre(25, "Shoujo")
            val HAREM = Genre(35, "Harem")
            val MILITARY = Genre(38, "Military")
        }
    }

    /**
     * Reads the `table.anime_detail_related_anime` rows. The first cell of a row is the
     * relation type (colons removed), the second cell holds the links.
     *
     * @return one [Relation] per link whose URL has a numeric path segment right after
     * `anime`; other links and rows with fewer than two cells are skipped.
     */
    fun getRelated(): List<Relation> {
        return dom.select("table.anime_detail_related_anime tr").flatMap { row ->
            // A malformed row would otherwise make child(0)/child(1) throw.
            if (row.children().size < 2) return@flatMap emptyList<Relation>()

            val type = row.child(0).text().trim().replace(":", "")
            row.child(1).select("a").mapNotNull { link ->
                segmentAfter(link.attr("href"), "anime")
                    ?.toLongOrNull()
                    ?.let { id -> Relation(type, id) }
            }
        }
    }

    /**
     * @return the companies linked from the "producers", "studios" and "licensors" rows
     * (in that order), each tagged with the matching [ProducerFunction].
     */
    fun getProducers(): List<ProducerRelation> {
        fun relationsFor(key: String, function: ProducerFunction): List<ProducerRelation> =
            producersIn(key).map { ProducerRelation(function, it) }

        return relationsFor("producers", ProducerFunction.Producer) +
            relationsFor("studios", ProducerFunction.Studio) +
            relationsFor("licensors", ProducerFunction.Licensor)
    }

    /** Collects the links in row [key] whose URL has a numeric segment right after `producer`. */
    private fun producersIn(key: String): List<Producer> {
        val row = items[key]?.second ?: return emptyList()
        return row.select("a").mapNotNull { link ->
            segmentAfter(link.attr("href"), "producer")
                ?.toIntOrNull()
                ?.let { id -> Producer(id, link.text().trim()) }
        }
    }

    /** Returns the `/`-separated segment of [href] directly following [marker], or `null` if there is none. */
    private fun segmentAfter(href: String, marker: String): String? {
        val segments = href.split("/")
        val markerIndex = segments.indexOf(marker)
        return if (markerIndex >= 0) segments.getOrNull(markerIndex + 1) else null
    }

    /** A company together with the role it has for this anime. */
    data class ProducerRelation(val function: ProducerFunction, val producer: Producer)

    /** A company identified by the numeric id found in its URL and its display name. */
    data class Producer(val id: Int, val name: String)

    /**
     * @return the rating parsed from the part of the "rating" row before the first `" - "`,
     * or `null` when the row is absent (or when `Rating.fromString` yields `null`).
     */
    fun getRating(): MyAnimeListInfo.Rating? {
        val label = items["rating"]?.first?.split(" - ")?.first()?.trim() ?: return null
        return MyAnimeListInfo.Rating.fromString(label)
    }

    /**
     * Parses the "duration" row. Anything from `" per "` onwards is ignored; the rest may
     * contain an hour part (`hr.` / `hrs.`), a minute part (`min.`) and a second part
     * (`sec.`), in that order, each optional.
     *
     * @return the summed duration, or `null` when the row is absent or has another format.
     * An empty value yields [Duration.ZERO].
     */
    fun getDuration(): Duration? {
        val text = items["duration"]?.first
            ?.split(" per ", limit = 2)
            ?.firstOrNull()
            ?.trim()
            ?: return null
        val groups = DURATION_REGEX.matchEntire(text)?.groups ?: return null

        // Unmatched optional groups are null; toLongOrNull also guards against absurdly long digit runs.
        val hours = groups[1]?.value?.toLongOrNull() ?: 0L
        val minutes = groups[2]?.value?.toLongOrNull() ?: 0L
        val seconds = groups[3]?.value?.toLongOrNull() ?: 0L

        return Duration.ofHours(hours).plusMinutes(minutes).plusSeconds(seconds)
    }

    /** A link to another anime ([id]) together with how it relates to this one. */
    data class Relation(val type: AnimeRelation.RelationType, val id: Long) {
        /** Convenience constructor resolving the relation type from its textual label. */
        constructor(type: String, id: Long) : this(AnimeRelation.RelationType.fromString(type), id)
    }

    private companion object {
        /** Date format used in the "aired" row. */
        const val AIRED_DATE_PATTERN = "MMM d, yyyy"

        /** Groups: 1 = hours, 2 = minutes, 3 = seconds; parts may be separated by whitespace. */
        val DURATION_REGEX = Regex("""(?:(\d+) hrs?\.?)?\s*(?:(\d+) min\.?)?\s*(?:(\d+) sec\.?)?""")
    }
}