You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

247 lines
7.8 KiB
Kotlin

4 years ago
package moe.odango.index.scraper.mal
import moe.odango.index.entity.AnimeRelation
import moe.odango.index.entity.MyAnimeListInfo
import moe.odango.index.utils.IntDate
import moe.odango.index.utils.ProducerFunction
import moe.odango.index.utils.brText
import org.jsoup.Jsoup
import org.jsoup.nodes.Element
import java.time.Duration
import java.util.*
class AnimePageScraper(body: String) {
private val dom = Jsoup.parse(body)
private val items: MutableMap<String, Pair<String, Element>> = mutableMapOf()
/**
 * Everything this scraper extracts from one anime page, bundled into a single value.
 *
 * Instances are produced by `getInfo()`, which fills each property with the result of the
 * getter of the same name (for example [title] comes from `getTitle()`). Nullable properties
 * are `null` when the corresponding getter could not find or parse its entry.
 */
data class Info(
    // Titles and free text.
    val title: String,
    val englishName: String?,
    val japaneseName: String?,
    val synonyms: List<String>,
    val description: String,

    // Basic facts about the release.
    val type: MyAnimeListInfo.ReleaseType,
    val episodes: Int?,
    val source: String?,
    val image: String?,
    val genres: List<Genre>,

    // Scheduling and classification.
    val aired: Aired?,
    val premiered: Premiered?,
    val rating: MyAnimeListInfo.Rating?,
    val duration: Duration?,

    // Links to other entities.
    val related: List<Relation>,
    val producers: List<ProducerRelation>
)
/**
 * Runs every individual getter once and collects the results into an [Info].
 *
 * The getters are invoked in the order the properties are declared on [Info]. Any exception
 * thrown by a getter is not caught here and propagates to the caller.
 */
fun getInfo(): Info = Info(
    title = getTitle(),
    englishName = getEnglishName(),
    japaneseName = getJapaneseName(),
    synonyms = getSynonyms(),
    description = getDescription(),
    type = getReleaseType(),
    episodes = getEpisodes(),
    source = getSource(),
    image = getImage(),
    genres = getGenres(),
    aired = getAired(),
    premiered = getPremiered(),
    rating = getRating(),
    duration = getDuration(),
    related = getRelated(),
    producers = getProducers()
)
/**
 * Episode count taken from the "episodes" entry.
 *
 * @return the parsed number, or `null` when the entry is absent or its text is not a plain integer.
 */
fun getEpisodes(): Int? {
    val (text, _) = items["episodes"] ?: return null
    return text.toIntOrNull()
}
/**
 * Release type taken from the "type" entry.
 *
 * The entry text is resolved with `ReleaseType.valueOf`, so it must match an enum constant
 * name exactly.
 *
 * @return the matching [MyAnimeListInfo.ReleaseType].
 * @throws IllegalStateException when the page has no "type" entry (previously a message-less
 *   null-pointer failure from `!!`).
 * @throws IllegalArgumentException when the text does not name a `ReleaseType` constant.
 */
fun getReleaseType(): MyAnimeListInfo.ReleaseType {
    val raw = checkNotNull(items["type"]?.first) {
        "Anime page has no \"type\" entry; cannot determine the release type"
    }
    return MyAnimeListInfo.ReleaseType.valueOf(raw)
}
/**
 * Main title of the page: the first line of the text of the first
 * `.h1-title span[itemprop=name]` element.
 *
 * The text is obtained through the project helper `brText()` (presumably it renders `<br>` as a
 * line break — confirm against its definition); everything from the first newline on is dropped.
 * Fails if the page has no such element.
 */
fun getTitle(): String {
    val heading = dom.select(".h1-title span[itemprop=name]").first()
    return heading.brText().substringBefore("\n")
}
/**
 * Alternative titles from the "synonyms" entry, split on commas and trimmed.
 *
 * @return an empty list when there is no "synonyms" entry, or when the English name or the
 *   title contains a comma (see the note in the body).
 */
fun getSynonyms(): List<String> {
    // Synonyms are separated by commas, so when the English name or the title itself contains
    // a comma the list cannot be parsed reliably; give up rather than return wrong pieces.
    val englishHasComma = getEnglishName()?.contains(",") == true
    if (englishHasComma || getTitle().contains(",")) {
        return listOf()
    }

    val raw = items["synonyms"]?.first ?: return listOf()
    return raw.split(",").map { it.trim() }
}
/** Text of the "japanese" entry, or `null` when the page has no such entry. */
fun getJapaneseName(): String? = items["japanese"]?.first
/** Text of the "english" entry, or `null` when the page has no such entry. */
fun getEnglishName(): String? = items["english"]?.first
/*
 * Indexes every "Label: value" row of the page into `items`.
 *
 * Each `.dark_text` element is treated as a label; the element that directly contains it is the
 * row. The row text is split on the first ':' into a lower-cased, trimmed key and a trimmed
 * value, and the row element is kept alongside the value so getters can inspect its links.
 * A row without ':' is stored with its whole text as both key and value. Later rows with the
 * same key replace earlier ones.
 */
init {
    // Use each label's direct parent only. Elements.parents() returns *all* ancestors of the
    // matched elements (up to <html>), which indexed whole-page containers under meaningless
    // keys and let a wrapper whose text starts with a label overwrite the real row.
    // Assumes the label is a direct child of its row element.
    val rows = dom.select(".dark_text").mapNotNull { it.parent() }
    for (row in rows) {
        val parts = row.text().split(":", limit = 2)
        val key = parts.first().trim().toLowerCase()
        val value = parts.last().trim()
        items[key] = value to row
    }
}
/** Text of the "source" entry, or `null` when the page has no such entry. */
fun getSource(): String? = items["source"]?.first
/**
 * URL of the cover image, read from the `data-src` attribute of the first
 * `img[itemprop=image]` element.
 *
 * @return the attribute value, or `null` when there is no such image or the attribute is
 *   missing or empty.
 */
fun getImage(): String? {
    val img = dom.selectFirst("img[itemprop=image]") ?: return null
    // Jsoup's attr() yields "" (never null) for an absent attribute; report that as "no image"
    // so callers relying on the nullable return type do not receive an empty URL.
    return img.attr("data-src").takeIf { it.isNotEmpty() }
}
/**
 * Description text taken from the first `[itemprop=description]` element, converted with the
 * project helper `brText()`.
 *
 * @return the description, or an empty string when the page has no such element.
 */
fun getDescription(): String {
    val node = dom.selectFirst("[itemprop=description]")
    return node?.brText() ?: ""
}
/**
 * Airing window parsed from the "aired" entry.
 *
 * The entry text is split once on `" to "`; the part before it is the start date and the
 * optional part after it is the end date. Both are parsed with the pattern `MMM d, yyyy` in
 * the US locale.
 *
 * @return `null` when the entry is absent or the start date cannot be parsed; otherwise an
 *   [Aired] whose end is `null` if there is no end part or it cannot be parsed.
 */
fun getAired(): Aired? {
    val range = items["aired"]?.first ?: return null
    val pieces = range.split(" to ", limit = 2)

    // The start date is mandatory: without it there is nothing meaningful to report.
    val start = IntDate.parse("MMM d, yyyy", pieces[0], Locale.US) ?: return null
    val end = pieces.getOrNull(1)?.let { endText ->
        IntDate.parse("MMM d, yyyy", endText, Locale.US)
    }
    return Aired(start, end)
}
/**
 * Airing window of an anime.
 *
 * @property start first airing date.
 * @property end last airing date; defaults to `null` when none is known.
 */
data class Aired(
    val start: IntDate,
    val end: IntDate? = null
)
/**
 * Premiere season parsed from the "premiered" entry, expected as `"<season> <year>"`.
 *
 * @return `null` when the entry is absent, does not consist of exactly two space-separated
 *   parts, names an unknown season, or has a non-integer year.
 */
fun getPremiered(): Premiered? {
    val tokens = items["premiered"]?.first?.split(" ", limit = 2) ?: return null
    if (tokens.size != 2) return null

    val season = Premiered.Season.fromString(tokens[0]) ?: return null
    val year = tokens[1].toIntOrNull() ?: return null
    return Premiered(season, year)
}
/**
 * Season and year in which an anime premiered.
 *
 * @property season the premiere season.
 * @property year the premiere year.
 */
data class Premiered(val season: Season, val year: Int) {
    /** The four seasons a premiere can fall in. */
    enum class Season {
        Spring,
        Summer,
        Fall,
        Winter;

        companion object {
            /**
             * Resolves a season from its name, ignoring case.
             *
             * The comparison does not depend on the JVM default locale, so upper-case input
             * such as `"SPRING"` or `"WINTER"` is still recognised under locales with special
             * casing rules for `I` (e.g. Turkish).
             *
             * @param season the season name, e.g. `"Spring"`.
             * @return the matching [Season], or `null` when the name is not one of the four.
             */
            fun fromString(season: String): Season? =
                values().firstOrNull { it.name.equals(season, ignoreCase = true) }
        }
    }
}
/**
 * Genres listed in the "genres" entry.
 *
 * Every link inside the entry is inspected; the genre id is the path segment that follows
 * `genre` in the link's `href`, and the genre name is the trimmed link text.
 *
 * Links whose `href` has no `genre` segment, or whose following segment is missing or not an
 * integer, are skipped instead of aborting the whole scrape with an exception.
 *
 * @return the parsed genres, or an empty list when the page has no "genres" entry.
 */
fun getGenres(): List<Genre> {
    val row = items["genres"]?.second ?: return emptyList()
    return row.select("a").mapNotNull { link ->
        val segments = link.attr("href").split("/")
        val marker = segments.indexOf("genre")
        // indexOf returns -1 when absent; without this guard segment 0 would be read as the id.
        if (marker < 0) return@mapNotNull null
        val id = segments.getOrNull(marker + 1)?.toIntOrNull() ?: return@mapNotNull null
        Genre(id, link.text().trim())
    }
}
/**
 * A genre identified by a numeric id and a display name.
 *
 * @property id numeric genre id (as parsed from genre links by the scraper).
 * @property name display name of the genre.
 */
data class Genre(val id: Int, val name: String) {
    /** Predefined genre constants. */
    companion object {
        val COMEDY = Genre(id = 4, name = "Comedy")
        val DEMONS = Genre(id = 6, name = "Demons")
        val DRAMA = Genre(id = 8, name = "Drama")
        val FANTASY = Genre(id = 10, name = "Fantasy")
        val HENTAI = Genre(id = 12, name = "Hentai")
        val MAGIC = Genre(id = 16, name = "Magic")
        val PARODY = Genre(id = 20, name = "Parody")
        val ROMANCE = Genre(id = 22, name = "Romance")
        val SCHOOL = Genre(id = 23, name = "School")
        val SCIFI = Genre(id = 24, name = "Sci-Fi")
        val SHOUJO = Genre(id = 25, name = "Shoujo")
        val HAREM = Genre(id = 35, name = "Harem")
        val MILITARY = Genre(id = 38, name = "Military")
    }
}
/**
 * Related anime listed in the `table.anime_detail_related_anime` table.
 *
 * For each table row the first cell supplies the relation label (trimmed, with ':' removed)
 * and every link in the second cell supplies one related entry. The related id is the path
 * segment that follows `anime` in the link's `href`.
 *
 * Rows with fewer than two cells, and links without an `anime` segment or without a numeric
 * id after it, are skipped rather than raising an index error.
 *
 * @return one [Relation] per usable link; empty when the table is missing.
 */
fun getRelated(): List<Relation> {
    return dom.select("table.anime_detail_related_anime tr").flatMap { row ->
        val cells = row.children()
        if (cells.size < 2) return@flatMap emptyList<Relation>()

        val type = cells[0].text().trim().replace(":", "")
        cells[1].select("a").mapNotNull { link ->
            val segments = link.attr("href").split("/")
            val marker = segments.indexOf("anime")
            // getOrNull covers an href that ends right after the "anime" segment.
            val id = if (marker < 0) null else segments.getOrNull(marker + 1)?.toLongOrNull()
            id?.let { Relation(type, it) }
        }
    }
}
/**
 * Companies credited on the page, each tagged with the role it was listed under.
 *
 * The "producers", "studios" and "licensors" entries are read in that order. Within an entry
 * every link whose `href` contains a `producer` path segment followed by an integer id yields
 * one [Producer] (id from the path, name from the trimmed link text).
 *
 * Links without such a segment, or with a missing or non-numeric id, are skipped instead of
 * throwing.
 *
 * @return producers first, then studios, then licensors; empty when none of the entries exist.
 */
fun getProducers(): List<ProducerRelation> {
    // Extracts the companies linked from the entry stored under [key].
    fun companiesIn(key: String): List<Producer> {
        val row = items[key]?.second ?: return emptyList()
        return row.select("a").mapNotNull { link ->
            val segments = link.attr("href").split("/")
            val marker = segments.indexOf("producer")
            if (marker < 0) return@mapNotNull null
            val id = segments.getOrNull(marker + 1)?.toIntOrNull() ?: return@mapNotNull null
            Producer(id, link.text().trim())
        }
    }

    // Wraps every company found under [key] with the given role.
    fun tagged(key: String, function: ProducerFunction): List<ProducerRelation> =
        companiesIn(key).map { ProducerRelation(function, it) }

    return tagged("producers", ProducerFunction.Producer) +
        tagged("studios", ProducerFunction.Studio) +
        tagged("licensors", ProducerFunction.Licensor)
}
/**
 * A company together with the role it has for this anime.
 *
 * @property function the role the company was listed under.
 * @property producer the company itself.
 */
data class ProducerRelation(
    val function: ProducerFunction,
    val producer: Producer
)
/**
 * A company referenced from an anime page.
 *
 * @property id numeric id of the company.
 * @property name display name of the company.
 */
data class Producer(
    val id: Int,
    val name: String
)
/**
 * Age rating parsed from the "rating" entry.
 *
 * Only the part before the first `" - "` (trimmed) is handed to `Rating.fromString`.
 *
 * @return `null` when the page has no "rating" entry; otherwise whatever `Rating.fromString`
 *   yields for the extracted code.
 */
fun getRating(): MyAnimeListInfo.Rating? {
    val raw = items["rating"]?.first ?: return null
    val code = raw.split(" - ").first().trim()
    return MyAnimeListInfo.Rating.fromString(code)
}
/**
 * Matches a duration such as `24 min.`, `1 hr. 30 min.` or `45 sec.`.
 *
 * Groups 1-3 capture hours, minutes and seconds; each part is optional and parts may be
 * separated by whitespace. Both `hr.` and `hrs.` are accepted, and the dots are literal.
 */
private val durationRegex = Regex("""(?:(\d+)\s*hrs?\.)?\s*(?:(\d+)\s*min\.)?\s*(?:(\d+)\s*sec\.)?""")

/**
 * Running time parsed from the "duration" entry.
 *
 * Anything from the first `" per "` on (e.g. a "per ep." suffix) is ignored. The remaining
 * text must consist solely of hour / minute / second parts as described on [durationRegex].
 *
 * Compared with the earlier pattern this also accepts whitespace between the parts and the
 * singular `hr.`, so combined values like `1 hr. 30 min.` are no longer rejected.
 *
 * @return the total duration, or `null` when the entry is absent, does not match the expected
 *   format, or contains no usable number (including an empty value, which previously produced
 *   a zero duration).
 */
fun getDuration(): Duration? {
    val text = items["duration"]?.first?.split(" per ", limit = 2)?.firstOrNull()?.trim() ?: return null
    val match = durationRegex.matchEntire(text) ?: return null

    // A group is null when its part is absent; toIntOrNull also guards against absurdly long digit runs.
    fun component(index: Int): Long? = match.groups[index]?.value?.toIntOrNull()?.toLong()

    val hours = component(1)
    val minutes = component(2)
    val seconds = component(3)
    if (hours == null && minutes == null && seconds == null) return null

    return Duration.ofHours(hours ?: 0L)
        .plusMinutes(minutes ?: 0L)
        .plusSeconds(seconds ?: 0L)
}
/**
 * A link from this anime to another one.
 *
 * @property type how the other anime relates to this one.
 * @property id numeric id of the related anime.
 */
data class Relation(val type: AnimeRelation.RelationType, val id: Long) {
    /**
     * Builds a relation from a textual relation label, resolving it through
     * `AnimeRelation.RelationType.fromString`.
     */
    constructor(type: String, id: Long) : this(
        type = AnimeRelation.RelationType.fromString(type),
        id = id
    )
}
}