jsoupでスクレイピングしてQiita organizationsの「いいね数」ランキングを取る
なんだかQiita Orgs Rankingが止まっているようなので、Javaでorganizationの「いいね数」を取得して並べてみました。
(Qiita Orgs Rankingと違いContribution数ではありません。)
スクレイピングにはjsoupのライブラリを利用しています。
実装はだいぶ適当なので動くのは今だけだと思います。
(実行しすぎるとQiitaに怒られるかもしれない)
ソースコード
import java.io.IOException;
import java.io.UncheckedIOException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Set;
import org.jsoup.helper.HttpConnection;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Scrapes the Qiita organization listing and prints the organizations ranked by
 * their "like" ("iine") count, highest first.
 *
 * <p>Each output line is tab-separated: rank, organization name, like count,
 * organization URL. The scraper relies on CSS class names in the fetched pages,
 * so it fails with an {@link IllegalStateException} when the expected elements
 * are not present.
 */
public class QiitaOrgsRank {

    /** Base of the paginated organization listing; the 1-based page number is appended. */
    private static final String ORG_LIST_URL = "http://qiita.com/organizations?page=";

    /**
     * Name and like count of one organization, loaded from its page on construction.
     */
    static class Data {
        private final String org;
        private final URL url;
        private final int iine;

        /**
         * Fetches the organization page and extracts its name and like count.
         *
         * <p>The first {@code .organizationHeader_profile_orgName} element is used as
         * the name and the second {@code .organizationHeader_stats_value} element is
         * parsed as the like count.
         *
         * @param url organization page to fetch
         * @throws UncheckedIOException if the page cannot be fetched
         * @throws IllegalStateException if the expected elements are missing or the
         *         like count is not an integer
         */
        public Data(URL url) {
            this.url = url;
            System.out.println("connect:" + url);
            Document document;
            try {
                document = HttpConnection.connect(url).get();
            } catch (IOException e) {
                throw new UncheckedIOException("Failed to fetch " + url, e);
            }
            Elements name = document.select(".organizationHeader_profile_orgName");
            Elements stats = document.select(".organizationHeader_stats_value");
            // Fail with the offending URL instead of a bare IndexOutOfBoundsException.
            if (name.isEmpty() || stats.size() < 2) {
                throw new IllegalStateException("Unexpected organization page layout: " + url);
            }
            org = name.get(0).text();
            iine = parseLikeCount(stats.get(1).text(), url);
        }

        /** Parses the trimmed like-count text, reporting the source URL on failure. */
        private static int parseLikeCount(String text, URL source) {
            String trimmed = text.trim();
            try {
                return Integer.parseInt(trimmed);
            } catch (NumberFormatException e) {
                throw new IllegalStateException(
                        "Like count is not an integer: '" + trimmed + "' at " + source, e);
            }
        }

        /** @return the like count read from the organization page */
        public int getIine() {
            return iine;
        }

        /** @return name, like count and URL separated by tabs */
        @Override
        public String toString() {
            return org + "\t" + iine + "\t" + url;
        }
    }

    /**
     * Collects every organization URL, loads each organization, and prints the
     * ranking in descending order of like count.
     *
     * @param args unused
     * @throws URISyntaxException if a listing URL cannot be converted to a URI
     */
    public static void main(String[] args) throws URISyntaxException {
        Set<URL> urls = getOrgUrls();
        // comparingInt avoids boxing every like count during the sort.
        Data[] ranked = urls.stream()
                .map(Data::new)
                .sorted(Comparator.comparingInt(Data::getIine).reversed())
                .toArray(Data[]::new);
        for (int i = 0; i < ranked.length; i++) {
            System.out.println((i + 1) + "\t" + ranked[i]);
        }
    }

    /**
     * Walks the listing pages from page 1 and gathers all organization URLs.
     *
     * <p>Stops at the first page that contributes no new URL (an empty page or a
     * repeated one) or at the first I/O failure.
     *
     * @return the distinct organization URLs found
     * @throws URISyntaxException if a listing URL cannot be converted to a URI
     */
    private static Set<URL> getOrgUrls() throws URISyntaxException {
        Set<URL> urls = new HashSet<>();
        for (int page = 1; ; page++) {
            Set<URL> orgs;
            try {
                orgs = getOrgUrls(page);
            } catch (IOException e) {
                // NOTE(review): presumably a request past the last page can fail with an
                // HTTP error, so a failure ends the walk; report it rather than hiding it.
                System.err.println("stopped at page " + page + ": " + e);
                break;
            }
            // addAll returns false when nothing new was added; this covers an empty page
            // and guards against looping forever on a page that keeps repeating.
            if (!urls.addAll(orgs)) {
                break;
            }
        }
        return urls;
    }

    /**
     * Fetches one listing page and resolves the organization links found on it.
     *
     * @param i 1-based page number
     * @return absolute organization URLs linked from the page; empty if none
     * @throws IOException if the page cannot be fetched
     * @throws URISyntaxException if the listing URL cannot be converted to a URI
     */
    private static Set<URL> getOrgUrls(int i) throws IOException, URISyntaxException {
        URL pageUrl = new URL(ORG_LIST_URL + i);
        System.out.println("connect:" + pageUrl);
        Document document = HttpConnection.connect(pageUrl).get();
        Elements links = document.select(".organizationsList_orgName").select("a");
        // Fully qualified because java.net.URI is not among the file's imports.
        java.net.URI base = pageUrl.toURI();
        Set<URL> urls = new HashSet<>();
        for (Element link : links) {
            urls.add(base.resolve(link.attr("href")).toURL());
        }
        return urls;
    }
}
結果
import java.io.IOException;
import java.io.UncheckedIOException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Set;
import org.jsoup.helper.HttpConnection;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Scrapes the Qiita organization listing and prints the organizations ranked by
 * their "like" ("iine") count, highest first.
 *
 * <p>Each output line is tab-separated: rank, organization name, like count,
 * organization URL. The scraper relies on CSS class names in the fetched pages,
 * so it fails with an {@link IllegalStateException} when the expected elements
 * are not present.
 */
public class QiitaOrgsRank {

    /** Base of the paginated organization listing; the 1-based page number is appended. */
    private static final String ORG_LIST_URL = "http://qiita.com/organizations?page=";

    /**
     * Name and like count of one organization, loaded from its page on construction.
     */
    static class Data {
        private final String org;
        private final URL url;
        private final int iine;

        /**
         * Fetches the organization page and extracts its name and like count.
         *
         * <p>The first {@code .organizationHeader_profile_orgName} element is used as
         * the name and the second {@code .organizationHeader_stats_value} element is
         * parsed as the like count.
         *
         * @param url organization page to fetch
         * @throws UncheckedIOException if the page cannot be fetched
         * @throws IllegalStateException if the expected elements are missing or the
         *         like count is not an integer
         */
        public Data(URL url) {
            this.url = url;
            System.out.println("connect:" + url);
            Document document;
            try {
                document = HttpConnection.connect(url).get();
            } catch (IOException e) {
                throw new UncheckedIOException("Failed to fetch " + url, e);
            }
            Elements name = document.select(".organizationHeader_profile_orgName");
            Elements stats = document.select(".organizationHeader_stats_value");
            // Fail with the offending URL instead of a bare IndexOutOfBoundsException.
            if (name.isEmpty() || stats.size() < 2) {
                throw new IllegalStateException("Unexpected organization page layout: " + url);
            }
            org = name.get(0).text();
            iine = parseLikeCount(stats.get(1).text(), url);
        }

        /** Parses the trimmed like-count text, reporting the source URL on failure. */
        private static int parseLikeCount(String text, URL source) {
            String trimmed = text.trim();
            try {
                return Integer.parseInt(trimmed);
            } catch (NumberFormatException e) {
                throw new IllegalStateException(
                        "Like count is not an integer: '" + trimmed + "' at " + source, e);
            }
        }

        /** @return the like count read from the organization page */
        public int getIine() {
            return iine;
        }

        /** @return name, like count and URL separated by tabs */
        @Override
        public String toString() {
            return org + "\t" + iine + "\t" + url;
        }
    }

    /**
     * Collects every organization URL, loads each organization, and prints the
     * ranking in descending order of like count.
     *
     * @param args unused
     * @throws URISyntaxException if a listing URL cannot be converted to a URI
     */
    public static void main(String[] args) throws URISyntaxException {
        Set<URL> urls = getOrgUrls();
        // comparingInt avoids boxing every like count during the sort.
        Data[] ranked = urls.stream()
                .map(Data::new)
                .sorted(Comparator.comparingInt(Data::getIine).reversed())
                .toArray(Data[]::new);
        for (int i = 0; i < ranked.length; i++) {
            System.out.println((i + 1) + "\t" + ranked[i]);
        }
    }

    /**
     * Walks the listing pages from page 1 and gathers all organization URLs.
     *
     * <p>Stops at the first page that contributes no new URL (an empty page or a
     * repeated one) or at the first I/O failure.
     *
     * @return the distinct organization URLs found
     * @throws URISyntaxException if a listing URL cannot be converted to a URI
     */
    private static Set<URL> getOrgUrls() throws URISyntaxException {
        Set<URL> urls = new HashSet<>();
        for (int page = 1; ; page++) {
            Set<URL> orgs;
            try {
                orgs = getOrgUrls(page);
            } catch (IOException e) {
                // NOTE(review): presumably a request past the last page can fail with an
                // HTTP error, so a failure ends the walk; report it rather than hiding it.
                System.err.println("stopped at page " + page + ": " + e);
                break;
            }
            // addAll returns false when nothing new was added; this covers an empty page
            // and guards against looping forever on a page that keeps repeating.
            if (!urls.addAll(orgs)) {
                break;
            }
        }
        return urls;
    }

    /**
     * Fetches one listing page and resolves the organization links found on it.
     *
     * @param i 1-based page number
     * @return absolute organization URLs linked from the page; empty if none
     * @throws IOException if the page cannot be fetched
     * @throws URISyntaxException if the listing URL cannot be converted to a URI
     */
    private static Set<URL> getOrgUrls(int i) throws IOException, URISyntaxException {
        URL pageUrl = new URL(ORG_LIST_URL + i);
        System.out.println("connect:" + pageUrl);
        Document document = HttpConnection.connect(pageUrl).get();
        Elements links = document.select(".organizationsList_orgName").select("a");
        // Fully qualified because java.net.URI is not among the file's imports.
        java.net.URI base = pageUrl.toURI();
        Set<URL> urls = new HashSet<>();
        for (Element link : links) {
            urls.add(base.resolve(link.attr("href")).toURL());
        }
        return urls;
    }
}
2017/7/22現在、弊社は15位
1 TIS株式会社 49212 http://qiita.com/organizations/tis
2 Mercari 47221 http://qiita.com/organizations/mercari
3 Wantedly, Inc. 45934 http://qiita.com/organizations/wantedly
4 Increments株式会社 38725 http://qiita.com/organizations/increments
5 株式会社ソニックガーデン 37053 http://qiita.com/organizations/sonicgarden
6 株式会社Rector 32109 http://qiita.com/organizations/rector
7 株式会社ベーシック 25553 http://qiita.com/organizations/basicinc
8 株式会社トップゲート 22962 http://qiita.com/organizations/topgate
9 ShouldBee 22815 http://qiita.com/organizations/shouldbee
10 株式会社 ドワンゴ 22029 http://qiita.com/organizations/dwango
11 ピクシブ株式会社 21340 http://qiita.com/organizations/pixiv
12 Drivemode, Inc. 18300 http://qiita.com/organizations/drivemode
13 株式会社アトラエ 16562 http://qiita.com/organizations/atrae
14 freee 16294 http://qiita.com/organizations/freee
15 フューチャーアーキテクト株式会社 16168 http://qiita.com/organizations/future
あれ?Incrementsさんとこの数、だいぶ減ってない?
Author And Source
この問題について(jsoupでスクレイピングしてQiita organizationsの「いいね数」ランキングを取る)、より詳しい情報は次のURLにあります: https://qiita.com/ota-meshi/items/5897d725dde372eac880 著者帰属:元の著者の情報は元のURLに記載されています。著作権は原作者に帰属します。
Content is automatically searched and collected through network algorithms. If there is a violation, please contact us. We will adjust (correct author information, or delete content) as soon as possible.