project

Step 1 Define imports¶

In [44]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pprint
from matplotlib import style
style.use('ggplot')
%matplotlib inline

In [86]:

# Shareable Google Drive page for the CSV; kept for reference, not fetched.
dataset = "https://drive.google.com/file/d/1QXvjvCB4okooR-CRAvQTQH2Z89AlkJYS/view?usp=sharing"

# Direct-download URL carrying the same file ID as the share link above;
# this is the address that is actually read.
dwn_url = 'https://drive.google.com/uc?id=1QXvjvCB4okooR-CRAvQTQH2Z89AlkJYS'

# One row per blog post; every later cell reads from this frame.
df = pd.read_csv(filepath_or_buffer=dwn_url)

Question¶

Show me User that has posted most number of blogs¶

In [117]:

# Bar chart of how many blogs each creator has posted, most prolific first.
fig, ax1 = plt.subplots(figsize=(20,10))

# value_counts() is sorted in descending order, so its index gives a bar order
# that puts the top poster on the left. Without `order`, the bars follow the
# order in which names first appear in the data and the answer is hard to read.
creators_by_posts = df['creator_name'].value_counts().index

# Draw on the axes created above instead of relying on the implicit current axes.
graph = sns.countplot(x='creator_name', data=df, order=creators_by_posts, ax=ax1)

# Rotate the long names via tick_params; re-assigning the label text with
# set_xticklabels() warns when the tick positions have not been fixed first.
graph.tick_params(axis='x', labelrotation=90)

plt.title("Total Hudi Blogs Posted By Users")

Out[117]:

Text(0.5, 1.0, 'Total Hudi Blogs Posted By Users')

C:\Users\s.shah\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 9410 (\N{CIRCLED LATIN CAPITAL LETTER M}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)

Question¶

Which Hudi Blogs have Most Claps show me top 5 blogs¶

In [48]:

# The five blogs with the highest clap counts, computed once.
# The original cell rebuilt this same frame a second time as a bare expression
# in the middle of the cell; a notebook only displays the last expression, so
# that statement did nothing. Evaluate `top_clapped_blogs` in its own cell to
# see the table (including the Medium URLs).
top_clapped_blogs = (
    df[["title", 'clapCount', 'mediumUrl']]
    .sort_values(by='clapCount', ascending=False)
    .head()
)

fig, ax1 = plt.subplots(figsize=(20,10))

# Draw on the axes created above instead of the implicit current axes.
graph = sns.barplot(
    data=top_clapped_blogs,
    x="title",
    y="clapCount",
    ax=ax1,
)
plt.title("Top 5 Hudi Blogs on Medium which got Most number of claps ")

Out[48]:

Text(0.5, 1.0, 'Top 5 Hudi Blogs on Medium which got Most number of claps ')

Question¶

Show me the top user who has received the maximum claps¶

In [124]:

# Total claps received per creator, top five.
# The data has one row per blog post, so ranking rows by clapCount ranks
# individual posts, not users. A creator with several highly clapped posts
# would then appear more than once, and barplot would draw the *mean* of those
# rows with an error bar. Summing per creator answers the question being asked.
claps_by_creator = (
    df.groupby('creator_name', as_index=False)['clapCount']
    .sum()
    .sort_values(by='clapCount', ascending=False)
    .head()
)

fig, ax1 = plt.subplots(figsize=(20,10))

# Draw on the axes created above instead of the implicit current axes.
graph = sns.barplot(
    data=claps_by_creator,
    x="creator_name",
    y="clapCount",
    ax=ax1,
)
plt.title("Creator that got most number of calps for Hudi blogs ")

Out[124]:

Text(0.5, 1.0, 'Creator that got most number of calps for Hudi blogs ')

Question¶

Show me the top collection which has received the most claps¶

In [50]:

# Total claps per collection, top three (the chart title promises three).
# The original plotted the five most-clapped *posts* on a collection axis:
# that is neither three entries nor a per-collection total, and posts sharing
# a collection were averaged by barplot. Posts with no collection name are
# left out by groupby.
claps_by_collection = (
    df.groupby('collection_name', as_index=False)['clapCount']
    .sum()
    .sort_values(by='clapCount', ascending=False)
    .head(3)
)

fig, ax1 = plt.subplots(figsize=(20,10))

# Draw on the axes created above instead of the implicit current axes.
graph = sns.barplot(
    data=claps_by_collection,
    x="collection_name",
    y="clapCount",
    ax=ax1,
)
plt.title("Top 3 Collection which has recieved Most Number of claps ")

Out[50]:

Text(0.5, 1.0, 'Top 3 Collection which has recieved Most Number of claps ')

Question¶

Show me User who has Max follower Count¶

In [57]:

# Follower count per creator, top five.
# Rows are blog posts, so a creator with several posts occupies several rows.
# Sorting the raw rows could fill the top five with the same person repeated.
# Reducing to one row per creator first guarantees five distinct users.
# max() is used in case the recorded count differs between a creator's rows.
followers_by_creator = (
    df.groupby('creator_name', as_index=False)['creator_socialStats_followerCount']
    .max()
    .sort_values(by='creator_socialStats_followerCount', ascending=False)
    .head()
)

fig, ax1 = plt.subplots(figsize=(20,10))

# Draw on the axes created above instead of the implicit current axes.
graph = sns.barplot(
    data=followers_by_creator,
    x="creator_name",
    y="creator_socialStats_followerCount",
    ax=ax1,
)
plt.title("Show me User who has Max follower Count ")

Out[57]:

Text(0.5, 1.0, 'Show me User who has Max follower Count ')

Question¶

Popular keywords to use as Title¶

In [81]:

import re
from wordcloud import WordCloud, STOPWORDS

# Punctuation to strip from titles. A raw string and a character class are
# used because '\.' in a normal string literal is an invalid escape sequence.
punctuation = re.compile(r'[,.!?]')

# Remove punctuation and THEN lowercase the cleaned text in one chain.
# The original lowercased df['title'] again in a second assignment, which
# overwrote the column and silently threw the punctuation removal away.
# The .str accessor also passes missing titles through as NaN instead of
# raising inside a lambda.
df['text_proc'] = (
    df['title']
    .str.replace(punctuation, '', regex=True)
    .str.lower()
)

# Join the processed titles into one string; missing titles are skipped
# because str.join() cannot handle NaN.
long_string = ','.join(df['text_proc'].dropna())

# Create the WordCloud object and generate the cloud from the joined titles.
wordcloud = WordCloud(
    background_color="black",
    max_words=3000,
    contour_width=3,
    contour_color='steelblue',
)
wordcloud.generate(long_string)

# Visualize the word cloud; the axes are hidden because tick marks and grid
# lines carry no meaning on top of an image.
plt.figure(figsize=(20,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

Question¶

Show me 10 Popular keywords in Apache hudi Blog Title¶

In [85]:

import pandas as pd
from collections import Counter

# Combine all the text from the 'title' column into a single string.
# Missing titles are dropped first: ' '.join() raises TypeError on NaN.
text = ' '.join(df['title'].dropna().astype(str))

# Convert the string to lowercase so 'Hudi' and 'hudi' count as one word.
text = text.lower()

# Split the string into individual whitespace-separated words.
words = text.split()

# Count the frequency of each word.
word_counts = Counter(words)

# (word, count) pairs for every word, most frequent first.
popular_keywords = word_counts.most_common()

# Print the top 10 most popular keywords.
# NOTE(review): nothing is filtered, so common function words ('and', 'in',
# 'to', ...) are counted too, and punctuation stays attached to words. Filter
# with a stop-word list if only topical keywords are wanted.
for keyword in popular_keywords[:10]:
    print(keyword)

('apache', 78)
('hudi', 63)
('data', 44)
('aws', 18)
('and', 17)
('in', 17)
('to', 17)
('with', 16)
('lake', 15)
('the', 14)

Question¶

Show me Blog on Hudi which has most number of responses or comments¶

In [137]:

# Table of the five blogs with the highest response counts, largest first.
(
    df.loc[:, ["title", 'mediumUrl', "creator_name", 'postResponses_count']]
    .sort_values('postResponses_count', ascending=False)
    .head(5)
)

Out[137]:

	title	mediumUrl	creator_name	postResponses_count
75	Origins of Data Lake at Grofers	https://lambda.blinkit.com/origins-of-data-lak...	Akshay Agarwal	5
78	Koo’s data platform — part 1: Apache Kafka and...	https://medium.com/koo-app/koos-data-platform-...	Phaneesh Gururaj	3
31	Bulk Insert Sort Modes with Apache Hudi	https://medium.com/@simpsons/bulk-insert-sort-...	Sivabalan Narayanan	3
76	Hudi vs Delta vs Iceberg Lakehouse Feature Com...	https://medium.com/apache-hudi-blogs/hudi-vs-d...	Kyle Weller	3
77	Delta vs Iceberg vs hudi : Reassessing Perform...	https://databeans-blogs.medium.com/delta-vs-ic...	DataBeans	3

In [139]:

# The five blogs with the highest response counts, computed once.
# The original also evaluated an unrelated top-3-by-claps frame as a bare
# mid-cell expression; a notebook never displays that, so it is dropped.
most_discussed_blogs = (
    df[["title", 'mediumUrl', "creator_name", 'postResponses_count']]
    .sort_values(by='postResponses_count', ascending=False)
    .head()
)

fig, ax1 = plt.subplots(figsize=(20,10))

# The question is about *blogs*, so the bars are labelled by blog title,
# matching the top-5-claps chart. Labelling by creator would average the bars
# of any creator who has more than one post in the selection.
graph = sns.barplot(
    data=most_discussed_blogs,
    x="title",
    y="postResponses_count",
    ax=ax1,
)
plt.title("Show me Blog on Hudi which has most number of responses or comments ")

Out[139]:

Text(0.5, 1.0, 'Show me Blog on Hudi which has most number of responses or comments ')

Feel free to explore More if needed¶

	title	mediumUrl	creator_name	readingTime
23	How does Lake House work: using Apache Hudi as...	https://medium.com/@dsfan/how-does-lake-house-...	Gang Peng	35.634906
6	Apache Hudi — The Streaming Data Lake Platform	https://medium.com/apache-hudi-blogs/apache-hu...	Vinoth Chandar	23.282075
89	Data processing with Spark: time traveling	https://blog.devgenius.io/data-processing-with...	Petrica Leuca	14.780503
75	Origins of Data Lake at Grofers	https://lambda.blinkit.com/origins-of-data-lak...	Akshay Agarwal	14.478302
4	AWS Data Lake Solution based on Apache Hudi wi...	https://fred-gu.medium.com/aws-data-lake-solut...	Fred Gu	12.738994

Pythonist

Friday, March 24, 2023

Data Analysis for Apache Hudi Blogs on Medium with Python pandas

Step 1 Define imports¶

Question¶

Show me User that has posted most number of blogs¶

Question¶

Which Hudi Blogs have Most Claps show me top 5 blogs¶

Question¶

Show me the top user who has received the maximum claps¶

Question¶

Show me the top collection which has received the most claps¶

Question¶

Top 5 Blogs with Maximum Reading Time¶

Question¶

Show me User who has Max follower Count¶

Question¶

Popular keywords to use as Title¶

Question¶

Show me 10 Popular keywords in Apache hudi Blog Title¶

Question¶

Show me Blog on Hudi which has most number of responses or comments¶

Feel free to explore More if needed¶

Getting started with LakeFS and Apache Iceberg Running Locally