Step 1: Define imports
In [44]:
# Data handling.
import pandas as pd
import numpy as np
# Plotting.
import matplotlib.pyplot as plt
import seaborn as sns
# NOTE(review): numpy and pprint are not referenced by the cells visible here — confirm before removing.
import pprint
from matplotlib import style
# Apply matplotlib's 'ggplot' style sheet to the figures created below.
style.use('ggplot')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
In [86]:
# Google Drive id of the CSV file; both URLs below point at this one file.
drive_file_id = "1QXvjvCB4okooR-CRAvQTQH2Z89AlkJYS"

# Share ("view") link, kept for reference, and the direct-download URL
# that is handed to read_csv.
dataset = f"https://drive.google.com/file/d/{drive_file_id}/view?usp=sharing"
dwn_url = f"https://drive.google.com/uc?id={drive_file_id}"

# Needs network access: the CSV is fetched from Drive on every run.
df = pd.read_csv(dwn_url)
In [117]:
# Number of posts per author: one bar per distinct creator_name.
fig, ax1 = plt.subplots(figsize=(20, 10))

# Draw on the axes created above instead of relying on the implicit
# "current axes" of the pyplot state machine.
graph = sns.countplot(x='creator_name', data=df, ax=ax1)

# Rotate the existing tick labels; there is no need to re-set the label
# text through set_xticklabels() just to change the angle.
ax1.tick_params(axis='x', labelrotation=90)

# Trailing ';' keeps the Text(...) repr out of the cell output.
ax1.set_title("Total Hudi Blogs Posted By Users");
Out[117]:
Text(0.5, 1.0, 'Total Hudi Blogs Posted By Users')
C:\Users\s.shah\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 9410 (\N{CIRCLED LATIN CAPITAL LETTER M}) missing from current font. fig.canvas.print_figure(bytes_io, **kw)
In [48]:
# Five posts with the highest clap counts, highest first.
# Built once and reused; the original recomputed the same selection in a
# second statement whose result was never displayed or stored.
top_clapped_posts = (
    df[["title", 'clapCount', 'mediumUrl']]
    .sort_values(by='clapCount', ascending=False)
    .head()
)

fig, ax1 = plt.subplots(figsize=(20, 10))
graph = sns.barplot(
    data=top_clapped_posts,
    x="title",
    y="clapCount",
    ax=ax1,  # draw on the figure created above
)

# Trailing ';' keeps the Text(...) repr out of the cell output.
ax1.set_title("Top 5 Hudi Blogs on Medium which got Most number of claps ");
Out[48]:
Text(0.5, 1.0, 'Top 5 Hudi Blogs on Medium which got Most number of claps ')
In [124]:
# Total claps received by each creator across all of their posts,
# keeping the five creators with the highest totals.
#
# The chart is about creators, so claps are summed per creator first.
# Plotting the top five *posts* with x="creator_name" would make barplot
# average (and draw an error bar for) any creator that has more than one
# post in that top five.
claps_by_creator = (
    df.groupby("creator_name", as_index=False)['clapCount']
    .sum()
    .sort_values(by='clapCount', ascending=False)
    .head()
)

fig, ax1 = plt.subplots(figsize=(20, 10))
graph = sns.barplot(
    data=claps_by_creator,
    x="creator_name",
    y="clapCount",
    ax=ax1,  # draw on the figure created above
)

# Trailing ';' keeps the Text(...) repr out of the cell output.
ax1.set_title("Creator that got most number of claps for Hudi blogs ");
Out[124]:
Text(0.5, 1.0, 'Creator that got most number of calps for Hudi blogs ')
In [50]:
# Total claps per collection, keeping the three collections with the
# highest totals (the chart is titled "Top 3").
#
# Claps are summed per collection before ranking; taking the top post
# rows and plotting x="collection_name" would neither guarantee three
# collections nor show their totals. Posts without a collection_name are
# left out by groupby's default NaN handling.
claps_by_collection = (
    df.groupby("collection_name", as_index=False)['clapCount']
    .sum()
    .sort_values(by='clapCount', ascending=False)
    .head(3)
)

fig, ax1 = plt.subplots(figsize=(20, 10))
graph = sns.barplot(
    data=claps_by_collection,
    x="collection_name",
    y="clapCount",
    ax=ax1,  # draw on the figure created above
)

# Trailing ';' keeps the Text(...) repr out of the cell output.
ax1.set_title("Top 3 Collection which has received Most Number of claps ");
Out[50]:
Text(0.5, 1.0, 'Top 3 Collection which has recieved Most Number of claps ')
In [51]:
# Five posts with the largest readingTime values, largest first.
# `data` is read again by the next cell, which prints the full URLs.
reading_time_columns = ["title", 'mediumUrl', "creator_name", 'readingTime']
data = (
    df[reading_time_columns]
    .sort_values(by='readingTime', ascending=False)
    .head()
)
data
Out[51]:
title | mediumUrl | creator_name | readingTime | |
---|---|---|---|---|
23 | How does Lake House work: using Apache Hudi as... | https://medium.com/@dsfan/how-does-lake-house-... | Gang Peng | 35.634906 |
6 | Apache Hudi — The Streaming Data Lake Platform | https://medium.com/apache-hudi-blogs/apache-hu... | Vinoth Chandar | 23.282075 |
89 | Data processing with Spark: time traveling | https://blog.devgenius.io/data-processing-with... | Petrica Leuca | 14.780503 |
75 | Origins of Data Lake at Grofers | https://lambda.blinkit.com/origins-of-data-lak... | Akshay Agarwal | 14.478302 |
4 | AWS Data Lake Solution based on Apache Hudi wi... | https://fred-gu.medium.com/aws-data-lake-solut... | Fred Gu | 12.738994 |
In [52]:
# Print each mediumUrl from `data` on its own line.
for x in data['mediumUrl'].tolist():
    print(x)
https://medium.com/@dsfan/how-does-lake-house-work-using-apache-hudi-as-an-example-18ec196e6626 https://medium.com/apache-hudi-blogs/apache-hudi-the-streaming-data-lake-platform-5964468678a4 https://blog.devgenius.io/data-processing-with-spark-time-traveling-55905f765694 https://lambda.blinkit.com/origins-of-data-lake-at-grofers-6c011f94b86c https://fred-gu.medium.com/aws-data-lake-solution-based-on-apache-hudi-without-requiring-database-cdc-999d2e5417e
In [57]:
# Five creators with the highest follower counts.
#
# df appears to hold one row per post (TODO confirm), so a creator with
# several posts would otherwise fill several of the five slots and
# collapse into a single bar. Sorting first and then keeping the first
# row per creator retains each creator's highest recorded follower count.
top_followed_creators = (
    df[["creator_name", 'creator_socialStats_followerCount']]
    .sort_values(by='creator_socialStats_followerCount', ascending=False)
    .drop_duplicates(subset="creator_name")
    .head()
)

fig, ax1 = plt.subplots(figsize=(20, 10))
graph = sns.barplot(
    data=top_followed_creators,
    x="creator_name",
    y="creator_socialStats_followerCount",
    ax=ax1,  # draw on the figure created above
)

# Trailing ';' keeps the Text(...) repr out of the cell output.
ax1.set_title("Show me User who has Max follower Count ");
Out[57]:
Text(0.5, 1.0, 'Show me User who has Max follower Count ')
In [81]:
import re
from wordcloud import WordCloud, STOPWORDS

# Strip basic punctuation AND lowercase each title in one pass.
# Doing the two steps as separate assignments that both start from
# df['title'] makes the second one overwrite the first, silently losing
# the punctuation removal. The raw string avoids the invalid "\." escape.
df['text_proc'] = df['title'].map(
    lambda title: re.sub(r'[,.!?]', '', title).lower()
)

# Join the processed titles into one string for the word cloud.
long_string = ','.join(df['text_proc'])

# Create the WordCloud object and generate the cloud from the joined text.
wordcloud = WordCloud(background_color="black", max_words=3000, contour_width=3,
                      contour_color='steelblue')
wordcloud.generate(long_string)

# Visualize the word cloud.
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation='bilinear')
# Hide ticks and grid lines: pixel coordinates carry no meaning here.
plt.axis("off")
plt.show()
In [85]:
import pandas as pd
from collections import Counter

# Merge every value of the 'title' column into one lowercase string.
text = ' '.join(df['title']).lower()

# Whitespace-separated tokens; punctuation stays attached to the words.
words = text.split()

# Tally how often each token occurs.
word_counts = Counter()
word_counts.update(words)

# (token, count) pairs ordered from most to least frequent.
popular_keywords = word_counts.most_common()

# Show the ten most frequent tokens.
for keyword in popular_keywords[:10]:
    print(keyword)
('apache', 78) ('hudi', 63) ('data', 44) ('aws', 18) ('and', 17) ('in', 17) ('to', 17) ('with', 16) ('lake', 15) ('the', 14)
In [137]:
# Five posts with the highest postResponses_count, highest first.
response_columns = ["title", 'mediumUrl', "creator_name", 'postResponses_count']
(
    df[response_columns]
    .sort_values(by='postResponses_count', ascending=False)
    .head()
)
Out[137]:
title | mediumUrl | creator_name | postResponses_count | |
---|---|---|---|---|
75 | Origins of Data Lake at Grofers | https://lambda.blinkit.com/origins-of-data-lak... | Akshay Agarwal | 5 |
78 | Koo’s data platform — part 1: Apache Kafka and... | https://medium.com/koo-app/koos-data-platform-... | Phaneesh Gururaj | 3 |
31 | Bulk Insert Sort Modes with Apache Hudi | https://medium.com/@simpsons/bulk-insert-sort-... | Sivabalan Narayanan | 3 |
76 | Hudi vs Delta vs Iceberg Lakehouse Feature Com... | https://medium.com/apache-hudi-blogs/hudi-vs-d... | Kyle Weller | 3 |
77 | Delta vs Iceberg vs hudi : Reassessing Perform... | https://databeans-blogs.medium.com/delta-vs-ic... | DataBeans | 3 |
In [139]:
# Five posts with the highest postResponses_count, labelled by author.
# The original also evaluated an unrelated clapCount top-3 expression
# mid-cell whose result was never displayed or stored; it is dropped.
most_discussed_posts = (
    df[["title", 'mediumUrl', "creator_name", 'postResponses_count']]
    .sort_values(by='postResponses_count', ascending=False)
    .head()
)

fig, ax1 = plt.subplots(figsize=(20, 10))
graph = sns.barplot(
    data=most_discussed_posts,
    x="creator_name",
    y="postResponses_count",
    ax=ax1,  # draw on the figure created above
)

# Trailing ';' keeps the Text(...) repr out of the cell output.
ax1.set_title("Show me Blog on Hudi which has most number of responses or comments ");
Out[139]:
Text(0.5, 1.0, 'Show me Blog on Hudi which has most number of responses or comments ')