BASIC WEB SCRAPING FROM WIKIPEDIA IN PYTHON!


 Necessary Dependencies:
  • beautifulsoup4==4.5.1
  • lxml==3.6.4
  • requests==2.11.1
  • tabulate==0.7.5
  • stop-words==2015.2.23.1
  Code::

import json
import operator
import re
import sys
import urllib.parse

import requests
from bs4 import BeautifulSoup
from stop_words import get_stop_words
from tabulate import tabulate

#functions
def remove_stop_words(frequency_list):
    """Return frequency_list with English stop words filtered out.

    frequency_list is an iterable of (word, count) pairs; the result is a
    list of [word, count] lists, preserving the input order.
    """
    stop_words = get_stop_words('en')
    # Keep only the pairs whose word is not an English stop word.
    return [[word, count] for word, count in frequency_list
            if word not in stop_words]

def createFrequencyTable(word_list):
    """Count occurrences of each word in word_list.

    Returns a dict mapping word -> number of times it appears.
    """
    counts = {}
    for token in word_list:
        # dict.get with a default replaces the explicit membership check.
        counts[token] = counts.get(token, 0) + 1
    return counts

def clean_word(word):
    """Strip every non-alphabetic character from *word* and return the rest."""
    return re.sub('[^A-Za-z]+', '', word)

def getWordList(url):
    """Fetch the page at *url* and return its paragraph text as a list of
    lowercase, alphabetic-only words.

    Words come from every <p> tag; each is lowercased and stripped of
    non-letter characters via clean_word(); empty results are dropped.
    """
    word_list = []
    # raw data -- explicit timeout so a hung server raises
    # requests.exceptions.Timeout (which the caller handles) instead of
    # blocking forever.
    source_code = requests.get(url, timeout=10)
    #convert to text
    plain_text = source_code.text
    #lxml format
    soup = BeautifulSoup(plain_text, 'lxml')

    #find the words in paragraph tag
    for paragraph in soup.findAll('p'):
        content = paragraph.text
        # .text is always a string, so check for emptiness; the original
        # `is None` test could never trigger.
        if not content:
            continue
        #lowercase and split into an array
        words = content.lower().split()

        #for each word
        for word in words:
            #remove non-chars
            cleaned_word = clean_word(word)
            #if there is still something there
            if cleaned_word:
                #add it to our word list
                word_list.append(cleaned_word)
    return word_list

#Getting data from the wiki

wiki_api_link = "https://en.wikipedia.org/w/api.php?format=json&action=query&list=search&srsearch="
wiki_arti_link = "https://en.wikipedia.org/wiki/"

# Require a search term on the command line.
if len(sys.argv) < 2:
    print('Enter valid search word!!')
    # sys.exit instead of the interactive-only exit() builtin.
    sys.exit()

#get the search word
string_query = sys.argv[1]

# Any extra argument switches on stop-word removal.
search_mode = len(sys.argv) > 2

# create our URL; percent-encode the query so multi-word or special-char
# searches produce a valid URL.
url = wiki_api_link + urllib.parse.quote(string_query)

try:
    # Explicit timeout: without it requests.get never times out and the
    # Timeout handler below is unreachable.
    response = requests.get(url, timeout=10)
    data = json.loads(response.content.decode('utf-8'))

    #format this data
    search_results = data['query']['search']
    # Guard against an empty result set (the original crashed with
    # IndexError here).
    if not search_results:
        print('No Wikipedia article found for that search term.')
        sys.exit()
    wiki_page_tag = search_results[0]['title']

    # create our new url; quote the title -- it may contain spaces.
    url = wiki_arti_link + urllib.parse.quote(wiki_page_tag)
    page_word_list = getWordList(url)

    #create table of word counts
    page_word_count = createFrequencyTable(page_word_list)
    sorted_word_frequency_list = sorted(page_word_count.items(),
                                        key=operator.itemgetter(1),
                                        reverse=True)

    #remove stop words
    if search_mode:
        sorted_word_frequency_list = remove_stop_words(sorted_word_frequency_list)

    #sum the total words to calculate the frequency
    total_words_sum = sum(value for key, value in sorted_word_frequency_list)

    #to get the top 20 words
    if len(sorted_word_frequency_list) > 20:
        sorted_word_frequency_list = sorted_word_frequency_list[:20]

    #create our final list, words + frequency + percentage
    final_list = []
    for key, value in sorted_word_frequency_list:
        percentage_value = float(value * 100) / total_words_sum
        final_list.append([key, value, round(percentage_value, 4)])

    print_headers = ['Word', 'Frequency', 'Frequency Percentage']

    #print the table with tabulate
    print(tabulate(final_list, headers=print_headers, tablefmt='orgtbl'))

except requests.exceptions.Timeout:
    print("The server didn't respond. Please, try again later.")





More Python content is coming soon — stay tuned! ;)

Comments

Popular posts from this blog

YU YUNICORN Review!!

One Plus 3 Flash Sale!!!