initial commit
This commit is contained in:
369
arxiv_spider.py
Normal file
369
arxiv_spider.py
Normal file
@@ -0,0 +1,369 @@
|
||||
import requests
|
||||
import pickle
|
||||
import time
|
||||
from lib import utils
|
||||
from lib.parser import dom_node, simple_parser
|
||||
|
||||
import socket
|
||||
import socks
|
||||
|
||||
# Optional SOCKS5 proxy support: when enabled, ALL socket connections made by
# this process (including those made by `requests`) are routed through the
# local proxy, because the global socket factory is monkey-patched below.
use_proxy = False
if use_proxy:
    # NOTE(review): assumes a SOCKS5 proxy listening locally on port 1080
    # (e.g. a shadowsocks/ssh tunnel client) — confirm before enabling.
    SOCKS5_PROXY_HOST = '127.0.0.1'
    SOCKS5_PROXY_PORT = 1080
    # Keep a reference to the original socket factory so it could be restored.
    default_socket = socket.socket
    socks.set_default_proxy(socks.SOCKS5, SOCKS5_PROXY_HOST, SOCKS5_PROXY_PORT)
    # Monkey-patch: every new socket now goes through the SOCKS proxy.
    socket.socket = socks.socksocket
|
||||
|
||||
class arxiv_paper():
    """One arXiv paper: an arXiv id plus a metadata dict.

    `paper_info` is a dict with the keys 'title', 'authors' (list of str),
    'comments', 'subjects' and 'abstract'; any value may be None when the
    field is unknown (see arxiv_list_parser.get_paper_info).
    """

    def __init__(self, arxiv_id = None, paper_info = None):
        self.arxiv_id = arxiv_id   # e.g. '2101.00001'
        self.info = paper_info     # metadata dict (see class docstring)

    def add_author(self, author):
        """Append one author name to the paper's author list.

        Bug fix: the original appended the undefined name `authors`,
        which raised NameError on every call.
        """
        self.info['authors'].append(author)

    def title(self):
        """Return the paper title (may be None if unknown)."""
        return self.info['title']

    def describe(self):
        """Return a human-readable, multi-line description of the paper."""
        information = ''
        information += 'ID: {0} (https://arxiv.org/abs/{0})\n'.format(self.arxiv_id)
        for key in self.info:
            if self.info[key] is not None:
                info = utils.formal_text(self.info[key])
                information += ('\t' + key + ':' + str(info) + '\n')
        return information

    def show(self):
        """Print the description produced by describe()."""
        print(self.describe())

    def to_html(self):
        """Render the paper as an HTML fragment (a <paper-section> dom tree).

        Fields that are None stay as None placeholders; presumably
        dom_node.add_child tolerates None children — verify against
        lib.parser.
        """
        dom_tree = dom_node(name = 'paper-section')
        paper_title = None
        paper_link = None
        paper_authors = None
        paper_comments = None
        paper_subjects = None
        paper_abstract = None
        for key in self.info:
            if self.info[key] is None:
                continue
            if key == 'title':
                # The title links to the abstract page; a separate node
                # links to the pdf.
                paper_title = dom_node('paper-title')
                link_attr = {'href':'https://arxiv.org/abs/{0}'.format(self.arxiv_id)}
                link_node = dom_node('a', link_attr)
                link_node.data = self.info[key]
                paper_title.add_child(link_node)
                paper_link = dom_node('paper-pdf-link')
                pdf_link_attr = {'href':'https://arxiv.org/pdf/{0}'.format(self.arxiv_id)}
                pdf_link = dom_node('a', pdf_link_attr)
                pdf_link.data = '{0} | [pdf]'.format(self.arxiv_id)
                paper_link.add_child(pdf_link)
            elif key == 'authors':
                paper_authors = dom_node('paper-authors')
                # join() replaces the manual concatenate-then-trim loop;
                # same output, including '' for an empty list.
                paper_authors.data = ', '.join(self.info[key])
            elif key == 'comments':
                paper_comments = dom_node('paper-comments')
                paper_comments.data = self.info[key]
            elif key == 'subjects':
                paper_subjects = dom_node('paper-subjects')
                paper_subjects.data = self.info[key]
            elif key == 'abstract':
                paper_abstract = dom_node('paper-abstract')
                paper_abstract.data = self.info[key]
        dom_tree.add_child(paper_title)
        dom_tree.add_child(paper_link)
        dom_tree.add_child(paper_authors)
        dom_tree.add_child(paper_abstract)
        dom_tree.add_child(paper_comments)
        dom_tree.add_child(paper_subjects)
        return dom_tree.to_string()

    def download_abstract(self, forcemode=False):
        """Fetch the abstract from the paper's arxiv.org/abs page.

        Does nothing if the abstract is already present unless
        `forcemode` is True. The abstract is scraped from the
        og:description <meta> tag and stored into self.info['abstract'].
        """
        if not forcemode and self.info['abstract'] is not None:
            # Already downloaded; skip the network round-trip.
            return
        r = requests.get('https://arxiv.org/abs/' + self.arxiv_id)
        parser = simple_parser()
        parser.feed(r.text)
        tree = parser.root
        for meta_node in tree.search('meta'):
            meta_attr = meta_node.attributes
            if 'property' in meta_attr:
                if meta_attr['property'] == 'og:description':
                    self.info['abstract'] = utils.formal_text(meta_attr['content'])
                    return
|
||||
|
||||
class arxiv_list_parser():
    """Parser for arxiv.org /list/<topic> HTML pages.

    On those pages each paper is a <dt> element (id / links) followed by a
    <dd> element (title, authors, comments, subjects).
    """

    def __init__(self, html_page):
        self.html_page = html_page
        self.parser = simple_parser()
        self.parser.feed(html_page)
        self.tree = self.parser.root

    def get_arxiv_id(self, dt_node):
        """Extract the arXiv id from a <dt> node, or None if the node is empty."""
        if len(dt_node.childs) == 0:
            return None
        # NOTE(review): relies on the abstract link being the first child of
        # the dt's second child — confirm against lib.parser's tree shape.
        arxiv_id = dt_node.childs[1].childs[0].attributes['href']
        # The href ends in '/<id>'; keep just the id.
        return arxiv_id.split('/')[-1]

    def get_paper_info(self, dd_node):
        """Build a paper-info dict from a <dd> node, or None if the node is empty.

        Returns keys 'title', 'authors', 'comments', 'subjects' and
        'abstract'; 'abstract' is always None here (it is fetched later
        by arxiv_paper.download_abstract).
        """
        title = None
        authors = []
        comments = None
        subjects = None
        if len(dd_node.childs) == 0:
            return None
        for element in dd_node.childs[0].childs:
            if 'class' not in element.attributes:
                continue
            element_class = element.attributes['class']
            if element_class == 'list-title mathjax':
                title = utils.formal_text(element.data)
            elif element_class == 'list-authors':
                # Author names are the text of the <a> children.
                for child in element.childs:
                    if child.name == 'a':
                        authors.append(utils.formal_text(child.data))
            elif element_class == 'list-comments mathjax':
                comments = utils.formal_text(element.data)
            elif element_class == 'list-subjects':
                subjects = utils.formal_text(element.data)
        return {
            'title':title,
            'authors':authors,
            'comments':comments,
            'subjects':subjects,
            'abstract':None
        }

    def get_papers(self):
        """Return an arxiv_paper for every (dt, dd) pair found on the page."""
        dts = self.tree.search('dt')
        dds = self.tree.search('dd')
        papers = []
        for dt, dd in zip(dts, dds):
            arxiv_id = self.get_arxiv_id(dt)
            if arxiv_id is None:   # idiom fix: was `== None` with stray ';'
                continue
            paper_info = self.get_paper_info(dd)
            if paper_info is None:
                continue
            papers.append(arxiv_paper(arxiv_id, paper_info))
        return papers

    def get_paper_num(self):
        """Return the total entry count announced in the page's <small> header.

        The first all-digit token (e.g. in 'total of 369 entries') is the
        count; 0 is returned when no such token exists.
        """
        totally_paper_node = self.tree.search('small')[0].data
        for token in totally_paper_node.split(' '):
            if token.isdigit():
                return int(token)
        return 0

    def get_recent_info(self):
        """For a /recent page, map each day name to its start index and count.

        Returns {day_name: {'start': int, 'num': int}}.
        """
        # Get each day's start id and name from the day-navigation <ul>.
        day_name = []
        day_start = []
        li_nodes = self.tree.search('ul')[0].childs
        for li in li_nodes:
            link = li.childs[0].attributes['href']
            start = None
            if link.find('#item') != -1:
                # Same-page anchor of the form '...#itemNNN'.
                start = link.split('#')[-1][4:]
            else:
                # Paging link of the form '...?skip=NNN&show=...'.
                start = link.split('=')[-2].split('&')[0]
            day_name.append(li.childs[0].data)
            day_start.append(int(start))
        # Total paper count across all listed days.
        num_total = self.get_paper_num()
        # Per-day counts: difference of consecutive start indices; the
        # last day runs up to the total.
        num_days = len(day_start)
        day_num = []
        for i in range(num_days):
            if i < num_days - 1:
                day_num.append(day_start[i+1] - day_start[i])
            else:
                day_num.append(num_total - day_start[i])
        # Assemble the final mapping.
        recent_papers_info = {}
        for day, start, num in zip(day_name, day_start, day_num):
            recent_papers_info[day] = {'start': start, 'num': num}
        return recent_papers_info
|
||||
|
||||
class arxiv_spider():
    """Scraper for arxiv.org listing pages, the RSS feed and the export API."""

    def __init__(self, topic, arxiv_url = 'https://arxiv.org'):
        self.link = arxiv_url
        self.topic = topic                                   # e.g. 'cs.CV'
        self.base_url = self.link + '/list/' + self.topic

    def get_yearly_papers(self, year, log=False):
        """Return all papers listed under the topic for `year` (a string).

        Makes two requests: one to read the total count, then one asking
        for every entry at once.
        """
        yearly_url = self.base_url + '/' + year
        if log:
            print('visiting url [{0}] for basic information'.format(yearly_url))
        r = requests.get(yearly_url)
        list_parser = arxiv_list_parser(r.text)
        total_num = list_parser.get_paper_num()
        print('Total Number for this year:', total_num)
        yearly_url_all = yearly_url + '?skip={0}&show={1}'.format(0, total_num)
        if log:
            print('visiting url [{0}] for all papers'.format(yearly_url_all))
        r = requests.get(yearly_url_all)
        list_parser = arxiv_list_parser(r.text)
        return list_parser.get_papers()

    # papers:
    # papers = {
    #     'key is day string': [content is a list of arxiv_paper class]
    # }

    def get_papers_on_search_list(self, search_url, log=True):
        """Fetch an export-API query URL and parse each <entry> into arxiv_paper."""
        if log:
            print('visiting url [{0}] for today papers.'.format(search_url))
        search_content = requests.get(search_url).text
        parser = simple_parser()
        parser.feed(search_content)
        tree = parser.root
        paper_nodes = tree.search('entry')
        print('num_searched_nodes:', len(paper_nodes))
        papers = []
        for node in paper_nodes:
            # The <id> element holds a URL ending in the arXiv id.
            arxiv_id = node.search('id')[0].data.split('/')[-1]
            title = node.search('title')[0].data
            authors = [item.data for item in node.search('name')]
            categories = [item.attributes['term'] for item in node.search('category')]
            # join() replaces the manual concatenate-then-trim loop (same
            # output, including '' for no categories).
            subjects = ','.join(categories)
            # <arxiv:comment> is optional; reuse the search result instead
            # of querying the tree twice as the original did.
            comments_node = node.search('arxiv:comment')
            comments = comments_node[0].data if comments_node else ''
            abstract = node.search('summary')[0].data

            # Normalize text fields (authors are kept raw, as before).
            paper_info = {
                'title':utils.formal_text(title),
                'authors':authors,
                'comments':utils.formal_text(comments),
                'subjects':utils.formal_text(subjects),
                'abstract':utils.formal_text(abstract)
            }
            papers.append(arxiv_paper(arxiv_id, paper_info))
        return papers

    def get_papers_by_ids(self, ids, log=True):
        """Fetch papers for a list of arXiv ids, batching 10 ids per API query."""
        # Integer ceiling division; replaces the fragile float hack
        # `int((len(ids) + 9.1) / 10)` (same result for all lengths >= 0).
        num_groups = (len(ids) + 9) // 10
        if log:
            print('splitting into {0} groups.'.format(num_groups))
        papers = []
        for i in range(num_groups):
            this_batch = ids[i * 10:(i + 1) * 10]
            id_list = ','.join(this_batch)
            search_url = 'http://export.arxiv.org/api/query?id_list=' + id_list
            papers += self.get_papers_on_search_list(search_url, log)
        return papers

    def get_today_ids(self, log=True):
        """Return the arXiv ids announced today, read from the topic's RSS feed."""
        rss_url = 'http://export.arxiv.org/rss/{0}'.format(self.topic)
        if log:
            print('visiting url [{0}] for today papers id.'.format(rss_url))
        rss_content = requests.get(rss_url).text
        parser = simple_parser()
        parser.feed(rss_content)
        rss = parser.root
        paper_ids = []
        for node in rss.search('rdf:li'):
            paper_link = node.attributes['rdf:resource']
            paper_ids.append(paper_link.split('/')[-1])
        print('num_paper_ids:', len(paper_ids))
        return paper_ids

    def get_today_paper(self, return_day_name=False, log=True):
        """Return today's papers (RSS ids + export-API metadata).

        `return_day_name` is accepted for interface compatibility with
        get_today_paper_backup but is unused here.
        """
        today_ids = self.get_today_ids(log)
        # Bug fix: propagate `log` — the original always used the default True.
        papers = self.get_papers_by_ids(today_ids, log)
        print('num of papers:', len(papers))
        return papers

    def get_today_paper_backup(self, return_day_name=False):
        """Fallback: scrape today's papers from the /recent listing page."""
        papers = self.get_recent_papers(recent_days=[1])
        today = None
        paper = None
        for day in papers:
            today = day
            paper = papers[day]
        if return_day_name:
            return paper, today
        return paper

    def get_recent_papers(self, recent_days=[1, 2, 3, 4, 5], log=False):
        """Scrape the /recent page and return {day_name: [arxiv_paper, ...]}.

        `recent_days` selects which days to fetch (1 = most recent).
        NOTE(review): mutable default argument kept for interface
        compatibility; it is never mutated here, so it is harmless.
        """
        recent_url = self.base_url + '/recent'
        if log:
            print('visiting url [{0}] for basic information'.format(recent_url))
        r = requests.get(recent_url)
        list_parser = arxiv_list_parser(r.text)
        recent_papers_info = list_parser.get_recent_info()
        print('paper info:', recent_papers_info)

        day_id = 1
        papers = {}
        for day in recent_papers_info:
            if day_id in recent_days:
                today_start = recent_papers_info[day]['start']
                today_num = recent_papers_info[day]['num']
                page_url = '/pastweek?skip={0}&show={1}'.format(today_start, today_num)
                day_url = self.base_url + page_url
                if log:
                    print('visiting url [{0}] for paper on day {1}'.format(day_url, day))
                r = requests.get(day_url)
                list_parser = arxiv_list_parser(r.text)
                papers[day] = list_parser.get_papers()
            day_id += 1
        return papers
|
||||
Reference in New Issue
Block a user