源网页:中国统计局标准 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/
打开网页后可以分析出行政区域划分共分为5层
根据传入参数生成网页地址:第1-3层直接使用链接本身;第4层及以后,需要在链接前补上由区划代码得到的目录前缀(例如省份代码,即代码的前两位)。
# Build the absolute URL of the page that has to be parsed.
def geturl(level, url, code):
    """Return the absolute URL of the page to fetch for a task.

    Args:
        level: Level (1-5) of the rows that will be parsed from the page.
        url: The ``href`` stored for the parent row. It is relative to the
            page it was scraped from.
        code: Division code of the parent row; its leading digit pairs are
            used as directory names.

    Returns:
        The absolute URL under the 2016 edition root.
    """
    base = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/'
    if level < 4:
        # These hrefs are used exactly as stored.
        return base + url
    if level == 4:
        # The href needs the province directory (first two code digits).
        return base + code[0:2] + '/' + url
    # NOTE(review): for level 5 the href is assumed to be relative to
    # "<province>/<city>/", so both directory levels are prepended; the
    # province-only prefix used previously would point at a missing page.
    # Confirm against a live town page.
    return base + code[0:2] + '/' + code[2:4] + '/' + url
第1-5层页面中,数据行所使用的标签(class 名称)各不相同。
# Look up the CSS class of the table rows to parse at a given level.
def getlevelclass(level):
    """Return the CSS class name of the data rows for ``level``.

    Args:
        level: Integer level, 1 through 5.

    Returns:
        The class name as a string (without a leading dot).

    Raises:
        KeyError: If ``level`` is not one of 1-5.
    """
    LevelDict = {
        1: "provincetr",
        2: "citytr",
        3: "countytr",
        4: "towntr",
        5: "villagetr",
    }
    return LevelDict[level]
根据网页上的标签以及实际地址去获取所需要的网页内容
# Build the HTTP request headers.
def getheaders():
    """Return a new dict of request headers containing a browser user-agent."""
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
    }
    return headers


# Download a page and return the rows that belong to the requested level.
def Get_WebContent(url, level, retries=3, delay=10):
    """Fetch ``url`` and select the elements carrying the class for ``level``.

    Args:
        url: Absolute URL of the page to download.
        level: Level (1-5) whose row class is looked up via getlevelclass().
        retries: Maximum number of download attempts (at least one is made).
        delay: Seconds to wait between two attempts.

    Returns:
        The result of ``soup.select('.<class>')`` for the downloaded page.

    Raises:
        urllib.error.URLError: If every attempt fails. The last error is
            re-raised instead of falling through to an unbound ``response``.
    """
    # Imported locally so the block does not depend on how the rest of the
    # file spells this import.
    from urllib.error import URLError

    request = urllib.request.Request(url, None, headers=getheaders())
    attempts = max(1, retries)
    last_error = None
    for attempt in range(1, attempts + 1):
        try:
            # The context manager closes the connection once it is read.
            with urllib.request.urlopen(request) as response:
                raw_html = response.read()
            break
        except URLError as e:
            last_error = e
            # e.reason may be an exception object rather than a string, so
            # it has to be converted before concatenation.
            print('执行第' + str(attempt) + '次错误,' + str(e.reason))
            if attempt < attempts:
                time.sleep(delay)
    else:
        # Every attempt failed: report the real cause to the caller.
        raise last_error

    # Undecodable bytes are dropped ('ignore').
    # NOTE(review): if the pages contain characters outside GB2312 they are
    # silently lost here; consider decoding as 'gbk' after confirming.
    html_text = raw_html.decode('gb2312', 'ignore')
    # Parsing the decoded text directly avoids re-encoding to UTF-8 bytes
    # only to have the parser decode them again.
    soup = BeautifulSoup(html_text, 'html.parser')
    # Select the rows of interest by their CSS class.
    levelclass = '.' + getlevelclass(level)
    return soup.select(levelclass)
第一层页面只有名称和链接(代码取自链接中的数字部分),其他层的每一行同时包含代码和名称,因此需要分别编写解析第一层和解析其他层的两个函数。
# NOTE: a "coding" declaration only takes effect on the first two lines of a
# file, so it is not repeated here; Python 3 source defaults to UTF-8.
from bs4 import BeautifulSoup


# Parse rows of level 2 and below: each row yields (code, name, url).
def Get_Child(souplist, parentid, level):
    """Build insert rows from table rows whose links come in pairs.

    Within each element of ``souplist`` the ``<a href>`` links are consumed
    two at a time: the first supplies the code text and the href, the second
    supplies the name text. A trailing unpaired link produces no row.

    Args:
        souplist: Iterable of parsed row elements (each offering find_all()).
        parentid: ID of the parent record, copied into every row.
        level: Level number, copied into every row.

    Returns:
        A list of ``(code, name, parentid, level, url)`` tuples.
    """
    rows = []
    for row_soup in souplist:
        links = iter(row_soup.find_all('a', href=True))
        # Zipping one iterator with itself walks the links pairwise.
        for code_link, name_link in zip(links, links):
            rows.append((
                code_link.get_text(),
                name_link.get_text(),
                parentid,
                level,
                code_link['href'],
            ))
    return rows


# Parse level-1 rows: each link yields (name, url); the code is the href
# with '.html' removed.
def Get_Main(souplist, parentid, level):
    """Build insert rows from rows where every link is one record.

    Args:
        souplist: Iterable of parsed row elements (each offering find_all()).
        parentid: ID of the parent record, copied into every row.
        level: Level number, copied into every row.

    Returns:
        A list of ``(code, name, parentid, level, url)`` tuples.
    """
    rows = []
    for row_soup in souplist:
        for link in row_soup.find_all('a', href=True):
            href = link['href']
            code = href.replace('.html', '')
            rows.append((code, link.get_text(), parentid, level, href))
    return rows


# Entry point actually called to extract the rows for a level.
def GetDetail(souplist, level, parentid):
    """Dispatch to Get_Main() for level 1 and to Get_Child() otherwise.

    Note the argument order differs from the two helpers:
    ``(souplist, level, parentid)``.

    Returns:
        The list of row tuples produced by the selected parser.
    """
    if level == 1:
        return Get_Main(souplist, parentid, level)
    return Get_Child(souplist, parentid, level)
SQLSERVER表
-- Storage for the scraped administrative divisions.
-- Name_Short is not written by the insert statement used in this file.
CREATE TABLE [dbo].[China_Position](
    [ID] [int] IDENTITY(0,1) NOT NULL,      -- auto-generated key, starts at 0
    [Code] [nvarchar](20) NULL,             -- division code
    [Name] [nvarchar](40) NULL,             -- division name
    [Name_Short] [nvarchar](20) NULL,
    [ParentID] [int] NULL,                  -- ID of the parent row
    [Level] [int] NULL,                     -- level number of the row
    [Urls] [nvarchar](200) NULL,            -- href scraped for the row
    -- 0 = children not fetched yet, 1 = done. The insert statement does not
    -- supply this column, so it needs a default to satisfy NOT NULL.
    [IsFinish] [smallint] NOT NULL DEFAULT (0)
)
# Insert the scraped rows into the table.
def DataInsert(ValueList):
    """Insert rows into [dbo].[China_Position] through SqlInsert().

    Args:
        ValueList: Sequence of ``(code, name, parentid, level, urls)``
            tuples matching the placeholders of the statement.
    """
    if not ValueList:
        # Nothing was parsed from the page; skip the database call.
        return
    SQLStr = "INSERT INTO [dbo].[China_Position]([Code] ,[Name] ,[ParentID],[Level] ,[Urls]) VALUES(%s ,%s ,%d,%d,%s) "
    SqlInsert(SQLStr, ValueList)


# Fetch the tasks that still have to run.
def GetTaskList(level):
    """Return the unfinished parent rows whose children are at ``level``.

    Rows are selected where ``IsFinish = 0`` and ``Level = level - 1``; the
    query returns ``Level + 1`` so each task carries the level to parse.

    Args:
        level: Integer level about to be scraped.

    Returns:
        A list of ``(ID, Level, Urls, Code)`` tuples.
    """
    # int() guarantees only a number is ever spliced into the statement.
    parent_level = int(level) - 1
    # The clauses are joined with explicit spaces so that keywords cannot run
    # into the preceding token.
    SQLStr = (
        "SELECT v1.[ID],v1.[Level]+1 as [Level],v1.[Urls],v1.Code "
        "FROM [dbo].[China_Position] v1 with(nolock) "
        "where [IsFinish]=0 And Level=" + str(parent_level)
    )
    cur = SqlSelect(SQLStr)
    return [(row[0], row[1], row[2], row[3]) for row in cur]
# Record that a task finished successfully.
def RecordLog(ID):
    """Set ``IsFinish = 1`` on the row with the given ID.

    The UPDATE statement is executed through the SqlDelete() helper.

    Args:
        ID: Integer primary key of the finished row.
    """
    # int() guarantees only a number is ever spliced into the statement.
    SQLStr = "update [dbo].[China_Position] set IsFinish=1 where ID=" + str(int(ID))
    SqlDelete(SQLStr)
执行最终的代码,获取level1-3层的数据。
# Scrape levels 1 through 3, one level at a time.
for i in range(1, 4):
    # Pending tasks for this level.
    TaskList = GetTaskList(i)
    for CTask in TaskList:
        # Each task is (ID of the parent row, level to parse, href, code).
        parentid, level, url, Code = CTask
        # Turn the stored href into the real page address.
        url = geturl(level, url, Code)
        # Download the page and pick out the rows for this level.
        souplist = Get_WebContent(url, level)
        # Rows to be inserted into the database.
        ValueList = GetDetail(souplist, level, parentid)
        # Write them to the database.
        DataInsert(ValueList)
        # Mark the parent as done so a later run skips it.
        RecordLog(parentid)