Wrote a Script to Collect AAAI Paper Abstracts

2011-12-01

These past few days I've needed to go through the AAAI abstracts and papers, but when I saw that AAAI has several hundred papers every year, my eyes glazed over. I also have the habit of sharpening my tools before setting to work, so I spent an hour or so writing a program that crawls the AAAI abstracts and collects them into a single HTML file.

First, the imports and the extraction masks. The three regexes target the paper pages on the AAAI OCS (Open Conference Systems) site; the div ids here follow the usual OCS paper-view markup, so double-check them against a live viewPaper page before starting a long crawl, and expect to update them if AAAI changes its layout.

```python
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
#
# Copyright 2011 Sigma http://www.sigma.me
#
# ####### usage #######
# $ python
# >>> from get_abstract_aaai import *
# >>> get_allyear_abstract(DBLP_conf_link)
# where DBLP_conf_link is like "http://www.informatik.uni-trier.de/~ley/db/conf/aaai/"
# (get_allyear_pdf was removed -- see the comments at the end of this post)
# OR just modify the main function and then
# $ python get_abstract_aaai.py

import os
import urllib
import urllib2
import httplib   # for https links
import re
import socket
import locale, string
import codecs

# Masks for the OCS paper-view pages on aaai.org.
title_regx    = r'<div id="title">([\s\S]+?)</div>'
author_regx   = r'<div id="author"><em>([\s\S]+?)</em></div>'
abstract_regx = r'Abstract</h4>([\s\S]*?)</div>'
```
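A regex scraper lives or dies by its masks, so it is worth a quick sanity check before crawling hundreds of pages. The fragment below is a hand-made imitation of the OCS markup I assume above, not a real AAAI page; with the script's definitions in scope it should print one match per field:

```python
# Hand-made imitation of the assumed OCS markup -- not a real AAAI page.
sample_html = '''
<div id="title">A Hypothetical Paper Title</div>
<div id="author"><em>Alice Example, Bob Example</em></div>
<div id="abstract"><h4>Abstract</h4>We study a toy problem.</div>
'''

for name, regx in [('title', title_regx),
                   ('author', author_regx),
                   ('abstract', abstract_regx)]:
    # each findall should return a one-element list
    print name, ':', re.findall(regx, sample_html)
```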
Then the crawling itself: one function walks the DBLP index of all AAAI proceedings, one handles a single year, and one scrapes a single paper page.

```python
# Links on a DBLP year page point into the AAAI OCS site; capture everything
# after ".../AAAI/AAAI", e.g. "11/paper/view/1234".
article_link_regx = r'http://www\.aaai\.org/ocs/index\.php/AAAI/AAAI([\s\S]+?)"'
dblp_oneyear_regx = r'http://www.informatik.uni-trier.de/~ley/db/conf/aaai/aaai([\s\S]*?)\.html'
# On the DBLP conference index, each year's table of contents is a relative
# link such as "aaai2011.html".
year_link_regx = r'<a href="(aaai[\s\S]*?\.html)">Contents'
#pdf_link_regx = r'href="([a-zA-Z0-9-\s\.\/]+)" class="action" target="_parent">PDF'
pdf_link_regx = r'Full Text:[\s\S]*?href="([^"]+)"[^>]*>PDF'   # left over from the removed PDF part

# get_allyear_abstract("http://www.informatik.uni-trier.de/~ley/db/conf/aaai/")
def get_allyear_abstract(url):
    """Walk the DBLP index of AAAI proceedings and crawl every year."""
    page = urllib.urlopen(url).read()
    year_links = re.findall(year_link_regx, page)
    for link in year_links:
        expend_link = url + link
        print expend_link
        get_oneyear_abstract(expend_link)

def get_oneyear_abstract(url):
    """Crawl one DBLP year page and write the abstracts to aaaiYEAR.html."""
    year = re.findall(dblp_oneyear_regx, url)[0]
    print year
    page = urllib.urlopen(url).read()
    links = re.findall(article_link_regx, page)
    articles = []
    if links:   # re.findall returns a (possibly empty) list, never None
        for link in links:
            # DBLP links to the "view" page; the abstract is on "viewPaper".
            paper_url = 'http://www.aaai.org/ocs/index.php/AAAI/AAAI' \
                        + link.replace('view', 'viewPaper')
            article = get_aaai_abstract_from_url(paper_url)
            if article is not None:
                articles.append(article)
        article_write2file(articles, year)

def get_aaai_abstract_from_url(url):
    """Scrape title, authors and abstract from one OCS paper page."""
    page = urllib.urlopen(url).read()
    title_val = extractAttrValue_mask(title_regx, page)
    author_val = extractAttrValue_mask(author_regx, page)
    abstract_val = extractAttrValue_mask(abstract_regx, page)
    ret_val = {'title': title_val.decode('UTF-8'),
               'author': author_val.decode('UTF-8'),
               'abstract': abstract_val.decode('UTF-8')}
    print ret_val
    return ret_val
```
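The only non-obvious step above is the URL rewrite in get_oneyear_abstract: a DBLP entry links to the OCS "view" page, while the abstract sits on the "viewPaper" page. The snippet below spells the mapping out; the paper id 1234 is made up for illustration.

```python
# The id 1234 is made up for illustration.
link = '11/paper/view/1234'   # what article_link_regx captures from DBLP
print 'http://www.aaai.org/ocs/index.php/AAAI/AAAI' + link.replace('view', 'viewPaper')
# -> http://www.aaai.org/ocs/index.php/AAAI/AAAI11/paper/viewPaper/1234
```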
Finally, the output side: dump everything into one HTML file, with a small helper that applies a mask and falls back to a placeholder when a page yields no match.

```python
def article_write2file(articles, year):
    """Write the scraped articles into a single aaaiYEAR.html file."""
    html_file_name = 'aaai' + year + '.html'
    html_head = u'<html><head><title>AAAI ' + year \
                + u' Publication &amp; Abstract List</title></head><body>'
    html_foot = u'</body></html>'
    html_file = codecs.open(html_file_name, 'w', 'utf-8')
    html_file.write(html_head)
    for article in articles:
        html_file.write(u'<h3>' + article['title'] + u'</h3>')
        html_file.write(article['author'] + u'<br/>')
        html_file.write(u'<b>Abstract:</b> ' + article['abstract'] + u'<br/>')
    html_file.write(html_foot)
    html_file.close()

def extractAttrValue_mask(regx, allstring):
    """Return the first match of regx in allstring, or a placeholder."""
    infos = re.findall(regx, allstring)
    attrValue = 'no information in AAAI database'
    if infos:
        attrValue = infos[0]
    return attrValue
```
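For a quick end-to-end test, you can scrape a single paper by hand before launching the full crawl. Again, the paper id here is made up; substitute a real viewPaper URL.

```python
# One-paper smoke test; the id 1234 is made up, use a real viewPaper URL.
article = get_aaai_abstract_from_url(
    'http://www.aaai.org/ocs/index.php/AAAI/AAAI11/paper/viewPaper/1234')
if article is not None:
    article_write2file([article], '2011')   # -> aaai2011.html with one entry
```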
And the entry point; uncomment the call to crawl all years:

```python
if __name__ == "__main__":
    print "main"
    # get_allyear_abstract('http://www.informatik.uni-trier.de/~ley/db/conf/aaai/')
```

In actual use I found that abstracts for papers before 2008 don't seem to be on the AAAI website at all, and many of those papers can't be found through Google Scholar either. Still, I figure anyone reasonably familiar with Python can easily adapt this code (provided the abstract and PDF links are not generated by JavaScript; if they are, you'll need some more advanced Python libraries).
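One last practical note: the script imports socket but never sets a timeout, so a single stalled connection can hang a crawl of several hundred pages. Below is a minimal hardening sketch; the fetch_page helper and the 30-second/3-retry numbers are my own choices, not part of the script above.

```python
import socket
import time
import urllib

socket.setdefaulttimeout(30)   # give up on a stalled connection after 30s

def fetch_page(url, retries=3, delay=2):
    """Fetch url, retrying a few times on IO errors; returns None on failure."""
    for attempt in range(retries):
        try:
            return urllib.urlopen(url).read()
        except IOError, e:   # urllib wraps socket errors in IOError
            print 'retry', attempt + 1, 'for', url, '-', e
            time.sleep(delay)
    return None
```

Swapping urllib.urlopen(url).read() for fetch_page(url) in the three crawl functions makes a long crawl much more resilient.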
Comments (4)
Batch-downloading papers is against the database's copyright terms.......
@syoummer
Fine, I'll take the paper-downloading part out right away.
Still learning from this. Bookmarked.
I don't have skills as advanced as yours; I just used my eyes + hands to download a year's worth of isca, hpca, micro papers and read through them.