用Python获取Google的下拉框自动完成提示文本


Python #关键字提取 2014-02-20 22:05

直接上代码了。

#!/usr/bin/python
#    Copyright (C) 2010 <x01110011@gmail.com> http://yige.org/python/

import urllib2, sys, getopt, time
import socket
from urllib import quote
from xml.etree.ElementTree import XMLParser
from threading import Thread

print "\n***************************"
print "* Google Suggest Scrapper *"
print "* Coded by ____________      *"
print "* x01110011@gmail.com        *"
print "***************************\n"

global i
global recursiveLevel
global outputFile
recursiveLevel = 0
outputFile = ''

def usage():
    print "Usage: GoogleSuggest.py [options] \n"
    print "       -k:     Keywords: Keywords to use (separated by #)"
    print "       -f:     File: File to read keywords from (overrides -k)"
    print "       -r:     Recursive level (0-5): Use suggested keywords to get more keywords up to -r times [Default: 0]"
    print "       -t:     Threads: Number of threads (default 5)"
    print "       -o:     Output file: Save keywords found to file\n\n"
    print "Examples:\n"
    print "       GoogleSuggest.py -k keyword1"
    print "       GoogleSuggest.py -k keyword1#keyword2#keyword3 -r 1"
    print "       GoogleSuggest.py -f keyword_file.txt -o keywords_found.txt -t 10"
    sys.exit()

class do(Thread):
    """Worker thread that collects Google Suggest completions for one seed keyword.

    The seed is queried first; then, for as many rounds as the module-level
    ``recursiveLevel`` says, every suggestion returned by the previous round is
    queried in turn.  Each suggestion found is appended to ``self.result`` as
    ``[[queried_keyword], [suggestion]]``.
    """

    # Seconds to wait on the suggest endpoint before abandoning a request, so
    # that a stalled connection cannot hang the thread (and whoever joins it).
    REQUEST_TIMEOUT = 10

    def __init__(self, keyword):
        """Remember the seed keyword and start with an empty result list."""
        Thread.__init__(self)
        self.keyword = keyword
        self.result = []

    def run(self):
        """Query the seed keyword and then its suggestions, level by level.

        The module-level counter ``i`` is decremented when the thread finishes,
        whether it finished normally or because a request raised.
        """
        global i
        try:
            # strip() removes the newline left by file input while keeping
            # multi-word keywords intact; a blank keyword is simply skipped.
            seed = self.keyword.strip()
            if not seed:
                return
            # Keywords that were already sent to the service; suggestions often
            # echo the query itself, and asking again would only add duplicates.
            queried = set([seed])
            frontier = self.getKeywords(seed)
            for _round in range(recursiveLevel):
                next_frontier = []
                for keyword in frontier:
                    if keyword in queried:
                        continue
                    queried.add(keyword)
                    # Accumulate the suggestions of *every* keyword of this
                    # round so the next round sees all of them.
                    next_frontier.extend(self.getKeywords(keyword))
                frontier = next_frontier
        finally:
            # NOTE: this read-modify-write is not atomic; `i` is only a rough
            # count when several workers finish at the same time.
            i = i - 1

    def getKeywords(self, keyword):
        """Fetch the suggestions for ``keyword`` and record them in ``self.result``.

        Returns the list of suggestion strings.  A failed request is reported
        on stderr and yields an empty list.
        """
        query = keyword
        if not isinstance(query, str):
            # On Python 2 a suggestion parsed from XML may be a unicode object,
            # which quote() cannot handle when it holds non-ASCII characters.
            query = query.encode('utf-8')
        url = 'http://clients1.google.com/complete/search?output=toolbar&q=' + quote(query)
        print(url)
        try:
            response = urllib2.urlopen(url, timeout=self.REQUEST_TIMEOUT)
            try:
                cont = response.read()
            finally:
                response.close()
        except IOError as err:
            sys.stderr.write('Request failed for %s: %s\n' % (url, err))
            return []
        parser = XMLParser()
        parser.feed(cont)
        tree = parser.close()
        found = []
        for entry in tree.findall('CompleteSuggestion'):
            node = entry.find('suggestion')
            if node is None:
                continue
            data = node.get('data')
            if data is None:
                continue
            self.result.append([[keyword], [data]])
            found.append(data)
        return found

def startThreads(keywords):
    """Run one ``do`` worker per keyword, at most ``th`` at a time, then print the results.

    ``keywords`` is consumed (emptied) while workers are started.  Workers are
    launched in batches of up to ``th`` threads; a batch is joined completely
    before the next one starts.  The results of every worker are gathered in
    keyword order and handed to output().
    """
    global i
    i = 0
    ret = []
    # `th` may still be the raw option string, so normalise it here; never
    # allow a limit below one or no worker could ever be started.
    limit = max(1, int(th))
    try:
        while keywords:
            batch = []
            while keywords and len(batch) < limit:
                worker = do(keywords.pop(0))
                # Workers decrement `i` when they finish, so keep counting
                # them here even though batching does not rely on it.
                i = i + 1
                worker.start()
                batch.append(worker)
            for worker in batch:
                worker.join()
                ret.extend(worker.result)
    except KeyboardInterrupt:
        print('Suspended by user...\n')
        sys.exit()
    output(ret)

def output(ret):
    """Print every collected suggestion and, if ``outputFile`` is set, save them.

    ``ret`` is a list of ``[[keyword], [suggestion]]`` entries; only the
    suggestion is printed and written, one per line.  The list is emptied as a
    side effect.  If the output file cannot be opened, a message is printed and
    the program exits.
    """
    suggestions = [entry[1][0] for entry in ret]
    # Callers have always received their list back empty; keep that visible
    # side effect without paying for repeated pop(0) calls.
    del ret[:]
    for data in suggestions:
        print(data)
    if not outputFile:
        return
    try:
        f = open(outputFile, 'w')
    except IOError:
        print('Can\'t open output file\n')
        sys.exit()
    try:
        lines = []
        for data in suggestions:
            line = data + '\n'
            if not isinstance(line, str):
                # On Python 2 a non-ASCII suggestion is a unicode object, which
                # a plain file would try (and fail) to encode as ASCII.
                line = line.encode('utf-8')
            lines.append(line)
        f.writelines(lines)
    finally:
        f.close()

def run(argv):
    """Parse the command-line options in ``argv`` and start the keyword harvest.

    Sets the module globals ``th`` (thread limit, default 5),
    ``recursiveLevel`` and ``outputFile`` from the -t, -r and -o options, builds
    the keyword list from -f (one keyword per line) or -k ('#'-separated), and
    passes it to startThreads().  Any invalid or incomplete command line ends
    in usage(), which terminates the program via sys.exit().
    """
    global th
    global recursiveLevel
    global outputFile
    th = 5
    # At least one option together with its value is required.
    if len(argv) < 2:
        usage()
    try:
        opts, _extra = getopt.getopt(argv, 'k:f:r:t:o:')
    except getopt.GetoptError:
        usage()

    keyword_arg = None
    keyword_path = None
    for opt, arg in opts:
        if opt == '-k':
            keyword_arg = arg
        elif opt == '-f':
            keyword_path = arg
        elif opt in ('-r', '-t'):
            try:
                number = int(arg)
            except ValueError:
                usage()
            if opt == '-r':
                # A negative level would make the workers skip every query.
                if number < 0:
                    usage()
                recursiveLevel = number
            else:
                # The limit must be a positive integer, not the raw option
                # string, for the thread throttling to mean anything.
                if number < 1:
                    usage()
                th = number
        elif opt == '-o':
            outputFile = arg

    if keyword_path is not None:
        # -f overrides -k, as stated in the usage text.
        try:
            handle = open(keyword_path, 'r')
        except IOError:
            print('Can\'t open keywords file\n')
            sys.exit()
        try:
            raw_keywords = handle.readlines()
        finally:
            handle.close()
    elif keyword_arg is not None:
        raw_keywords = keyword_arg.split('#')
    else:
        # Neither -k nor -f was given: there is nothing to look up.
        usage()

    # Drop surrounding whitespace/newlines and skip blank entries so that the
    # workers only ever receive real keywords.
    inputKeywords = [kw.strip() for kw in raw_keywords if kw.strip()]
    if not inputKeywords:
        usage()
    startThreads(inputKeywords)

if __name__ == "__main__":
    # Script entry point: hand everything after the program name to run().
    cli_args = sys.argv[1:]
    try:
        run(cli_args)
    except KeyboardInterrupt:
        # Ctrl+C while running: say so and leave without a traceback.
        print("Ctrl+C Exit By USER...\n")
        sys.exit()


相关文章

粤ICP备11097351号-1