[python] Gather all URLs from a given website

2019-12-12 写技术

# encoding=utf8 
import sys

reload(sys) 
sys.setdefaultencoding('utf8')

import urllib
from bs4 import BeautifulSoup
import urlparse
import time
import urllib2

# Crawl configuration and shared state (mutated by get_local_pages and dfs).
url = "http://log.anycle.com"   # start page of the crawl; rebound to the current page in dfs
domain = "log.anycle.com"       # links are kept only if their host contains this string
deep = 0                        # declared global in get_local_pages but never used — dead state
tmp = ""                        # declared global in get_local_pages but never used — dead state
sites = set()                   # every same-domain URL discovered so far (written to data.txt)
visited = set()                 # URLs already fetched, to avoid re-crawling / infinite loops
#local = set()

def get_local_pages(url, domain):
    """Fetch *url* and return the set of same-domain page links found on it.

    url    -- absolute URL of the page to scan
    domain -- substring the link's host must contain for the link to be kept
    Returns a set of absolute URLs; returns an empty set if the page cannot
    be fetched after 5 attempts.
    """
    global deep
    global sites
    global tmp
    repeat_time = 0
    pages = set()

    # Retry the fetch up to 5 times, sleeping 1s between attempts to be
    # polite to the server; give up with an empty result after that.
    while True:
        try:
            time.sleep(1)
            web = urllib2.urlopen(url=url, timeout=3)
            break
        except Exception:
            time.sleep(1)
            repeat_time += 1
            if repeat_time == 5:
                return pages

    soup = BeautifulSoup(web.read())
    for tag in soup.findAll(name='a'):
        try:
            ret = tag['href']
        except KeyError:
            # <a> tag without an href attribute
            continue
        o = urlparse.urlparse(ret)

        # Relative link (no scheme, no netloc): rebuild it as an absolute URL
        # based on the page actually fetched (geturl() follows redirects).
        if o[0] == "" and o[1] == "":
            url_obj = urlparse.urlparse(web.geturl())
            if ret[0] == '/':
                ret = url_obj[0] + "://" + url_obj[1] + ret
            else:
                ret = url_obj[0] + "://" + url_obj[1] + "/" + ret

            # Collapse duplicate slashes in the path, skipping "http://".
            ret = ret[:8] + ret[8:].replace('//', '/')
            o = urlparse.urlparse(ret)

            # Resolve "../" segments: blank out each ".." and the segment
            # preceding it, then rebuild the path from the survivors.
            if '../' in o[2]:
                paths = o[2].split('/')
                for i in range(len(paths)):
                    if paths[i] == '..':
                        paths[i] = ''
                        if paths[i - 1]:
                            paths[i - 1] = ''
                tmp_path = ''
                for path in paths:
                    if path == '':
                        continue
                    tmp_path = tmp_path + '/' + path
                # BUG FIX: the original referenced the undefined name
                # `ret_path` here, raising NameError for any "../" link.
                ret = ret.replace(o[2], tmp_path)

        # Keep only links with an http(s) scheme ...
        if 'http' not in o[0]:
            continue
        # ... that are not scheme-less network paths ...
        if o[0] == "" and o[1] != "":
            continue
        # ... and that stay on the target domain.
        if domain not in o[1]:
            continue

        # Skip binary downloads: report them instead of crawling them.
        # (Fixes the original's misleading "pdf :" label for every type.)
        if ret.endswith((".pdf", ".exe", ".rar", ".zip", ".png", ".jpg")):
            print("skip binary :" + ret)
            continue

        if ret not in sites:
            pages.add(ret)
    return pages

def dfs(pages):
    if pages is set():
        return
    global url
    global domain
    global sites
    global visited
    sites = set.union(sites,pages)
    for page in pages:
        if page not in visited:
            print "Visiting",page
            visited.add(page)
            url = page
            pages = get_local_pages(url, domain)
            dfs(pages)

    #print "sucess"

# Entry point: seed the crawl from the configured start URL, walk the site
# depth-first, then dump every discovered URL to ./data.txt.
pages = get_local_pages(url, domain)
dfs(pages)

print("*****************************\n")
print(" dfs over, write to file now\n")

# Robustness fix: use a context manager so the file is closed even if a
# write raises (the original used bare open()/close() with no try/finally).
with open("./data.txt", "w") as fd:
    for site in sites:
        print(site)
        fd.write(site + "\n")

标签: python

发表评论:

Powered by anycle 湘ICP备15001973号