Wrote a Script to Collect AAAI Paper Abstracts

2011-12-01

These past few days I've needed to go through the AAAI abstracts and papers, but when I saw that AAAI has several hundred papers every year, my eyes glazed over. I also have the habit of sharpening my tools before setting to work, so I spent an hour or so writing a program that crawls the AAAI abstracts and collects them into a single HTML file.

First, the imports and the extraction masks. The three regexes target the paper pages on the AAAI OCS (Open Conference Systems) site; the div ids here follow the usual OCS paper-view markup, so double-check them against a live viewPaper page before starting a long crawl, and expect to update them if AAAI changes its layout.

```python
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
#
# Copyright 2011 Sigma http://www.sigma.me
#
# ####### usage #######
# $ python
# >>> from get_abstract_aaai import *
# >>> get_allyear_abstract(DBLP_conf_link)
# where DBLP_conf_link is like "http://www.informatik.uni-trier.de/~ley/db/conf/aaai/"
# (get_allyear_pdf was removed -- see the comments at the end of this post)
# OR just modify the main function and then
# $ python get_abstract_aaai.py

import os
import urllib
import urllib2
import httplib   # for https links
import re
import socket
import locale, string
import codecs

# Masks for the OCS paper-view pages on aaai.org.
title_regx    = r'<div id="title">([\s\S]+?)</div>'
author_regx   = r'<div id="author"><em>([\s\S]+?)</em></div>'
abstract_regx = r'Abstract</h4>([\s\S]*?)</div>'
```
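A regex scraper lives or dies by its masks, so it is worth a quick sanity check before crawling hundreds of pages. The fragment below is a hand-made imitation of the OCS markup I assume above, not a real AAAI page; with the script's definitions in scope it should print one match per field:

```python
# Hand-made imitation of the assumed OCS markup -- not a real AAAI page.
sample_html = '''
<div id="title">A Hypothetical Paper Title</div>
<div id="author"><em>Alice Example, Bob Example</em></div>
<div id="abstract"><h4>Abstract</h4>We study a toy problem.</div>
'''

for name, regx in [('title', title_regx),
                   ('author', author_regx),
                   ('abstract', abstract_regx)]:
    # each findall should return a one-element list
    print name, ':', re.findall(regx, sample_html)
```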
Then the crawling itself: one function walks the DBLP index of all AAAI proceedings, one handles a single year, and one scrapes a single paper page.

```python
# Links on a DBLP year page point into the AAAI OCS site; capture everything
# after ".../AAAI/AAAI", e.g. "11/paper/view/1234".
article_link_regx = r'http://www\.aaai\.org/ocs/index\.php/AAAI/AAAI([\s\S]+?)"'
dblp_oneyear_regx = r'http://www.informatik.uni-trier.de/~ley/db/conf/aaai/aaai([\s\S]*?)\.html'
# On the DBLP conference index, each year's table of contents is a relative
# link such as "aaai2011.html".
year_link_regx = r'<a href="(aaai[\s\S]*?\.html)">Contents'
#pdf_link_regx = r'href="([a-zA-Z0-9-\s\.\/]+)" class="action" target="_parent">PDF'
pdf_link_regx = r'Full Text:[\s\S]*?href="([^"]+)"[^>]*>PDF'   # left over from the removed PDF part

# get_allyear_abstract("http://www.informatik.uni-trier.de/~ley/db/conf/aaai/")
def get_allyear_abstract(url):
    """Walk the DBLP index of AAAI proceedings and crawl every year."""
    page = urllib.urlopen(url).read()
    year_links = re.findall(year_link_regx, page)
    for link in year_links:
        expend_link = url + link
        print expend_link
        get_oneyear_abstract(expend_link)

def get_oneyear_abstract(url):
    """Crawl one DBLP year page and write the abstracts to aaaiYEAR.html."""
    year = re.findall(dblp_oneyear_regx, url)[0]
    print year
    page = urllib.urlopen(url).read()
    links = re.findall(article_link_regx, page)
    articles = []
    if links:   # re.findall returns a (possibly empty) list, never None
        for link in links:
            # DBLP links to the "view" page; the abstract is on "viewPaper".
            paper_url = 'http://www.aaai.org/ocs/index.php/AAAI/AAAI' \
                        + link.replace('view', 'viewPaper')
            article = get_aaai_abstract_from_url(paper_url)
            if article is not None:
                articles.append(article)
        article_write2file(articles, year)

def get_aaai_abstract_from_url(url):
    """Scrape title, authors and abstract from one OCS paper page."""
    page = urllib.urlopen(url).read()
    title_val = extractAttrValue_mask(title_regx, page)
    author_val = extractAttrValue_mask(author_regx, page)
    abstract_val = extractAttrValue_mask(abstract_regx, page)
    ret_val = {'title': title_val.decode('UTF-8'),
               'author': author_val.decode('UTF-8'),
               'abstract': abstract_val.decode('UTF-8')}
    print ret_val
    return ret_val
```
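The only non-obvious step above is the URL rewrite in get_oneyear_abstract: a DBLP entry links to the OCS "view" page, while the abstract sits on the "viewPaper" page. The snippet below spells the mapping out; the paper id 1234 is made up for illustration.

```python
# The id 1234 is made up for illustration.
link = '11/paper/view/1234'   # what article_link_regx captures from DBLP
print 'http://www.aaai.org/ocs/index.php/AAAI/AAAI' + link.replace('view', 'viewPaper')
# -> http://www.aaai.org/ocs/index.php/AAAI/AAAI11/paper/viewPaper/1234
```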
Finally, the output side: dump everything into one HTML file, with a small helper that applies a mask and falls back to a placeholder when a page yields no match.

```python
def article_write2file(articles, year):
    """Write the scraped articles into a single aaaiYEAR.html file."""
    html_file_name = 'aaai' + year + '.html'
    html_head = u'<html><head><title>AAAI ' + year \
                + u' Publication &amp; Abstract List</title></head><body>'
    html_foot = u'</body></html>'
    html_file = codecs.open(html_file_name, 'w', 'utf-8')
    html_file.write(html_head)
    for article in articles:
        html_file.write(u'<h3>' + article['title'] + u'</h3>')
        html_file.write(article['author'] + u'<br/>')
        html_file.write(u'<b>Abstract:</b> ' + article['abstract'] + u'<br/>')
    html_file.write(html_foot)
    html_file.close()

def extractAttrValue_mask(regx, allstring):
    """Return the first match of regx in allstring, or a placeholder."""
    infos = re.findall(regx, allstring)
    attrValue = 'no information in AAAI database'
    if infos:
        attrValue = infos[0]
    return attrValue
```
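For a quick end-to-end test, you can scrape a single paper by hand before launching the full crawl. Again, the paper id here is made up; substitute a real viewPaper URL.

```python
# One-paper smoke test; the id 1234 is made up, use a real viewPaper URL.
article = get_aaai_abstract_from_url(
    'http://www.aaai.org/ocs/index.php/AAAI/AAAI11/paper/viewPaper/1234')
if article is not None:
    article_write2file([article], '2011')   # -> aaai2011.html with one entry
```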
And the entry point; uncomment the call to crawl all years:

```python
if __name__ == "__main__":
    print "main"
    # get_allyear_abstract('http://www.informatik.uni-trier.de/~ley/db/conf/aaai/')
```

In actual use I found that abstracts for papers before 2008 don't seem to be on the AAAI website at all, and many of those papers can't be found through Google Scholar either. Still, I figure anyone reasonably familiar with Python can easily adapt this code (provided the abstract and PDF links are not generated by JavaScript; if they are, you'll need some more advanced Python libraries).
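One last practical note: the script imports socket but never sets a timeout, so a single stalled connection can hang a crawl of several hundred pages. Below is a minimal hardening sketch; the fetch_page helper and the 30-second/3-retry numbers are my own choices, not part of the script above.

```python
import socket
import time
import urllib

socket.setdefaulttimeout(30)   # give up on a stalled connection after 30s

def fetch_page(url, retries=3, delay=2):
    """Fetch url, retrying a few times on IO errors; returns None on failure."""
    for attempt in range(retries):
        try:
            return urllib.urlopen(url).read()
        except IOError, e:   # urllib wraps socket errors in IOError
            print 'retry', attempt + 1, 'for', url, '-', e
            time.sleep(delay)
    return None
```

Swapping urllib.urlopen(url).read() for fetch_page(url) in the three crawl functions makes a long crawl much more resilient.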
Comments (4)
Batch-downloading papers is against the database's copyright terms.......
@syoummer
Fine, I'll take the paper-downloading part out right away.
Still learning from this. Bookmarked.
I don't have skills as advanced as yours; I just used my eyes + hands to download a year's worth of isca, hpca, micro papers and read through them.