You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

247 lines
7.8 KiB
Kotlin

package moe.odango.index.scraper.mal
import moe.odango.index.entity.AnimeRelation
import moe.odango.index.entity.MyAnimeListInfo
import moe.odango.index.utils.IntDate
import moe.odango.index.utils.ProducerFunction
import moe.odango.index.utils.brText
import org.jsoup.Jsoup
import org.jsoup.nodes.Element
import java.time.Duration
import java.util.*
class AnimePageScraper(body: String) {
// Document produced by handing the constructor's `body` string to Jsoup.
private val dom = Jsoup.parse(body)

// Lookup table filled by the init block: key is the lower-cased text before the first ':' of a
// collected element, value pairs the remaining text with the element it came from.
private val items = mutableMapOf<String, Pair<String, Element>>()
/**
 * Bundle of every value the scraper extracts from one page.
 *
 * Each property is filled by the getter of the matching name (see `getInfo`); nullable
 * properties are `null` and list properties are empty when the getter found nothing.
 */
data class Info(
    val title: String,
    val englishName: String?,
    val japaneseName: String?,
    val synonyms: List<String>,
    val description: String,
    val type: MyAnimeListInfo.ReleaseType,
    val episodes: Int?,
    val source: String?,
    val image: String?,
    val genres: List<Genre>,
    val aired: Aired?,
    val premiered: Premiered?,
    val rating: MyAnimeListInfo.Rating?,
    val duration: Duration?,
    val related: List<Relation>,
    val producers: List<ProducerRelation>
)
/**
 * Runs every individual getter once, in declaration order of [Info]'s properties,
 * and bundles the results.
 *
 * @return the populated [Info]; any exception thrown by a getter propagates unchanged.
 */
fun getInfo(): Info = Info(
    title = getTitle(),
    englishName = getEnglishName(),
    japaneseName = getJapaneseName(),
    synonyms = getSynonyms(),
    description = getDescription(),
    type = getReleaseType(),
    episodes = getEpisodes(),
    source = getSource(),
    image = getImage(),
    genres = getGenres(),
    aired = getAired(),
    premiered = getPremiered(),
    rating = getRating(),
    duration = getDuration(),
    related = getRelated(),
    producers = getProducers()
)
/**
 * Episode count taken from the "episodes" entry.
 *
 * @return the parsed number, or `null` when the entry is missing or is not an integer.
 */
fun getEpisodes(): Int? = items["episodes"]?.first?.toIntOrNull()
/**
 * Release type resolved from the "type" entry via `ReleaseType.valueOf`.
 *
 * @throws NullPointerException when there is no "type" entry.
 * @throws IllegalArgumentException when the text is not the exact name of a `ReleaseType` constant.
 */
fun getReleaseType(): MyAnimeListInfo.ReleaseType {
    val typeText = items["type"]!!.first
    return MyAnimeListInfo.ReleaseType.valueOf(typeText)
}
/**
 * Title read from the first `.h1-title span[itemprop=name]` element.
 *
 * Only the text before the first line break of the element's `brText()` is kept.
 */
fun getTitle(): String {
    val heading = dom.select(".h1-title span[itemprop=name]").first()
    return heading.brText().substringBefore("\n")
}
/**
 * Synonyms from the "synonyms" entry, split on commas and trimmed.
 *
 * @return an empty list when the entry is missing, or when the English name or the title
 * contains a comma.
 */
fun getSynonyms(): List<String> {
    // Synonyms are separated by commas, so they cannot be parsed reliably when the
    // English name or the title itself contains one.
    val englishHasComma = getEnglishName()?.contains(",") == true
    if (englishHasComma || getTitle().contains(",")) return emptyList()

    val raw = items["synonyms"]?.first ?: return emptyList()
    return raw.split(",").map { it.trim() }
}
/** Text of the "japanese" entry, or `null` when the page has none. */
fun getJapaneseName(): String? = items["japanese"]?.first
/** Text of the "english" entry, or `null` when the page has none. */
fun getEnglishName(): String? = items["english"]?.first
// Builds the `items` lookup table from every element collected around a `.dark_text` node.
// For each element the text before the first ':' becomes the key and the rest the value;
// when the text holds no ':' the whole text ends up as both key and value.
init {
    // NOTE(review): Jsoup documents Elements.parents() as returning all ancestors of the
    // matched elements, not only the direct parents, so page-level containers are stored
    // here as well. Left unchanged; confirm whether only the direct parent is wanted.
    for (element in dom.select(".dark_text").parents()) {
        val text = element.text()
        // Lower-case with a fixed locale: the getters look keys up by fixed ASCII strings,
        // and the default-locale variant maps 'I' to a dotless 'ı' under e.g. Turkish.
        // toLowerCase(Locale) is used because the rest of the file uses the toLowerCase family.
        val key = text.substringBefore(":").trim().toLowerCase(Locale.ROOT)
        val value = text.substringAfter(":").trim()
        items[key] = value to element
    }
}
/** Text of the "source" entry, or `null` when the page has none. */
fun getSource(): String? = items["source"]?.first
/**
 * Value of the `data-src` attribute of the first `img[itemprop=image]` element.
 *
 * @return `null` when no such element exists; otherwise whatever `attr("data-src")` yields.
 */
fun getImage(): String? {
    val picture = dom.selectFirst("img[itemprop=image]") ?: return null
    return picture.attr("data-src")
}
/**
 * Description taken from the first `[itemprop=description]` element via `brText()`.
 *
 * @return the element's text, or an empty string when the element is absent.
 */
fun getDescription(): String {
    val node = dom.selectFirst("[itemprop=description]")
    return if (node == null) "" else node.brText()
}
/**
 * Airing window parsed from the "aired" entry.
 *
 * The text is split once on `" to "`; both halves are parsed with the pattern
 * `MMM d, yyyy` in the US locale.
 *
 * @return `null` when the entry is missing or the start date does not parse; the end
 * date is `null` when there is no second half or it does not parse.
 */
fun getAired(): Aired? {
    val airedText = items["aired"]?.first ?: return null
    val halves = airedText.split(" to ", limit = 2)

    val start = IntDate.parse("MMM d, yyyy", halves.first(), Locale.US) ?: return null
    val end = halves.getOrNull(1)?.let { endText ->
        IntDate.parse("MMM d, yyyy", endText, Locale.US)
    }
    return Aired(start, end)
}
/**
 * Airing window of a title.
 *
 * @property start first air date.
 * @property end last air date, or `null` (the default) when none is available.
 */
data class Aired(
    val start: IntDate,
    val end: IntDate? = null
)
/**
 * Premiere season parsed from the "premiered" entry, expected as `<season> <year>`.
 *
 * @return `null` when the entry is missing, does not split into exactly two parts on the
 * first space, names an unknown season, or has a non-integer year.
 */
fun getPremiered(): Premiered? {
    val text = items["premiered"]?.first ?: return null
    val words = text.split(" ", limit = 2)
    if (words.size != 2) return null

    val season = Premiered.Season.fromString(words[0]) ?: return null
    val year = words[1].toIntOrNull() ?: return null
    return Premiered(season, year)
}
/**
 * Season and year in which a title premiered.
 *
 * @property season the premiere season.
 * @property year the premiere year.
 */
data class Premiered(val season: Season, val year: Int) {
    /** The four seasons a premiere can fall in. */
    enum class Season {
        Spring,
        Summer,
        Fall,
        Winter;

        companion object {
            /**
             * Resolves a season name case-insensitively.
             *
             * @param season the name, e.g. `"Spring"` or `"WINTER"`.
             * @return the matching [Season], or `null` for any other text.
             */
            fun fromString(season: String): Season? {
                // Locale.ROOT keeps the match independent of the JVM default locale; the
                // default-locale variant turns "SPRING" into "sprıng" under Turkish.
                return when (season.toLowerCase(Locale.ROOT)) {
                    "spring" -> Spring
                    "summer" -> Summer
                    "fall" -> Fall
                    "winter" -> Winter
                    else -> null
                }
            }
        }
    }
}
/**
 * Genres linked from the element stored under the "genres" entry.
 *
 * Every `<a>` inside that element is inspected: the path segment following `genre` in its
 * `href` is the id and the link text is the name. Links without a `genre` segment, or whose
 * following segment is missing or not an integer, are skipped instead of throwing, in line
 * with how producer and related links are filtered elsewhere in this class.
 *
 * @return the parsed genres, or an empty list when there is no "genres" entry.
 */
fun getGenres(): List<Genre> {
    val row = items["genres"]?.second ?: return emptyList()
    return row.select("a").mapNotNull { link ->
        val segments = link.attr("href").split("/")
        segments.indexOf("genre")
            .takeIf { it >= 0 }
            ?.let { marker -> segments.getOrNull(marker + 1) }
            ?.toIntOrNull()
            ?.let { id -> Genre(id, link.text().trim()) }
    }
}
/**
 * A genre as an id/name pair.
 *
 * @property id numeric id (taken from the genre link's path when scraped).
 * @property name display name.
 */
data class Genre(val id: Int, val name: String) {
    companion object {
        // Predefined id/name pairs — presumably the site's own genre ids; verify before
        // extending the list.
        val COMEDY = Genre(id = 4, name = "Comedy")
        val DEMONS = Genre(id = 6, name = "Demons")
        val DRAMA = Genre(id = 8, name = "Drama")
        val FANTASY = Genre(id = 10, name = "Fantasy")
        val HENTAI = Genre(id = 12, name = "Hentai")
        val MAGIC = Genre(id = 16, name = "Magic")
        val PARODY = Genre(id = 20, name = "Parody")
        val ROMANCE = Genre(id = 22, name = "Romance")
        val SCHOOL = Genre(id = 23, name = "School")
        val SCIFI = Genre(id = 24, name = "Sci-Fi")
        val SHOUJO = Genre(id = 25, name = "Shoujo")
        val HAREM = Genre(id = 35, name = "Harem")
        val MILITARY = Genre(id = 38, name = "Military")
    }
}
/**
 * Related titles read from the rows of `table.anime_detail_related_anime`.
 *
 * In each row the first cell's text (colons removed) is the relation type and every `<a>`
 * in the second cell is a candidate: the path segment after `anime` in its `href` is the id.
 * Rows with fewer than two cells, links without an `anime` segment, and links whose id
 * segment is missing or not a number are skipped rather than throwing.
 *
 * @return one [Relation] per usable link; empty when the table is absent.
 */
fun getRelated(): List<Relation> {
    return dom.select("table.anime_detail_related_anime tr").flatMap { row ->
        val cells = row.children()
        if (cells.size < 2) {
            // Without both a label cell and a link cell the row cannot be interpreted.
            emptyList<Relation>()
        } else {
            val type = cells[0].text().trim().replace(":", "")
            cells[1].select("a").mapNotNull { link ->
                val parts = link.attr("href").split("/")
                parts.indexOf("anime")
                    .takeIf { it >= 0 }
                    ?.let { marker -> parts.getOrNull(marker + 1) }
                    ?.toLongOrNull()
                    ?.let { id -> Relation(type, id) }
            }
        }
    }
}
/**
 * Companies linked from the "producers", "studios" and "licensors" entries, each tagged
 * with the matching [ProducerFunction].
 *
 * @return producers first, then studios, then licensors; entries that are missing
 * contribute nothing.
 */
fun getProducers(): List<ProducerRelation> {
    // Reads every <a> of an entry's element: the path segment after "producer" is the id and
    // the link text the name. Links lacking that segment, or whose id segment is missing or
    // not an integer, are skipped instead of throwing.
    fun Pair<String, Element>?.linkedProducers(): List<Producer> {
        val row = this?.second ?: return emptyList()
        return row.select("a").mapNotNull { link ->
            val segments = link.attr("href").split("/")
            segments.indexOf("producer")
                .takeIf { it >= 0 }
                ?.let { marker -> segments.getOrNull(marker + 1) }
                ?.toIntOrNull()
                ?.let { id -> Producer(id, link.text().trim()) }
        }
    }

    val producers = items["producers"].linkedProducers()
    val studios = items["studios"].linkedProducers()
    val licensors = items["licensors"].linkedProducers()

    return producers.map { ProducerRelation(ProducerFunction.Producer, it) } +
        studios.map { ProducerRelation(ProducerFunction.Studio, it) } +
        licensors.map { ProducerRelation(ProducerFunction.Licensor, it) }
}
/**
 * Links a [Producer] to the role it plays for a title.
 *
 * @property function the role (producer, studio or licensor).
 * @property producer the company filling that role.
 */
data class ProducerRelation(
    val function: ProducerFunction,
    val producer: Producer
)
/**
 * A company as an id/name pair.
 *
 * @property id numeric id (taken from the producer link's path when scraped).
 * @property name display name.
 */
data class Producer(
    val id: Int,
    val name: String
)
/**
 * Age rating resolved from the "rating" entry.
 *
 * Only the trimmed text before the first `" - "` is handed to `Rating.fromString`.
 *
 * @return `null` when the entry is missing; otherwise whatever `Rating.fromString` yields.
 */
fun getRating(): MyAnimeListInfo.Rating? {
    val ratingText = items["rating"]?.first ?: return null
    val code = ratingText.substringBefore(" - ").trim()
    return MyAnimeListInfo.Rating.fromString(code)
}
// Matches an optional hours part ("hr." or "hrs."), an optional minutes part ("min.") and an
// optional seconds part ("sec."), separated by optional whitespace. The earlier pattern
// demanded the literal "hrs" and no separator before the minutes, so texts such as
// "1 hr. 30 min." or "2 hrs. 5 min." never matched; its unescaped '.' also accepted any
// character in place of the trailing dot.
private val durationRegex = Regex("""(?:(\d+) hrs?\.)?\s*(?:(\d+) min\.)?\s*(?:(\d+) sec\.)?""")

/**
 * Running time parsed from the "duration" entry.
 *
 * Anything from the first `" per "` onwards is ignored; the remainder must consist solely
 * of hour, minute and/or second parts as described on [durationRegex].
 *
 * @return the summed duration, or `null` when the entry is missing, does not match the
 * pattern, or contains none of the three parts (e.g. an empty text).
 */
fun getDuration(): Duration? {
    val text = items["duration"]?.first?.substringBefore(" per ") ?: return null
    val match = durationRegex.matchEntire(text) ?: return null

    // groupValues holds "" for a part that did not take part in the match.
    val (hours, minutes, seconds) = match.groupValues.drop(1).map { it.toLongOrNull() }
    if (hours == null && minutes == null && seconds == null) return null

    return Duration.ofHours(hours ?: 0)
        .plusMinutes(minutes ?: 0)
        .plusSeconds(seconds ?: 0)
}
/**
 * A link from the scraped title to another title.
 *
 * @property type the kind of relation.
 * @property id numeric id of the related title.
 */
data class Relation(
    val type: AnimeRelation.RelationType,
    val id: Long
) {
    /** Builds a relation from the textual [type], resolved through `RelationType.fromString`. */
    constructor(type: String, id: Long) : this(
        AnimeRelation.RelationType.fromString(type),
        id
    )
}
}