import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource
from bokeh.palettes import Spectral6, Category10
from bokeh.io import output_notebook, reset_output
import string
import re
output_notebook()
df = pd.read_csv('ds_jobs.csv')
df.head()
  | id | title | company | announcement | description
---|---|---|---|---|---
0 | 0 | Senior Analyst, Data Science and Analytics | TransUnion | The Muse | TransUnion's Job Applicant Privacy Notice Wha...
1 | 1 | Senior Data Scientist | Grubhub Holdings, Inc. | ZipRecruiter | About The Opportunity We're all about connect...
2 | 2 | Lead Data Science Analyst | Discover Financial Services |  | Discover. A brighter future. With us, you’ll ...
3 | 3 | Data Science Intern | AbelsonTaylor | Startup Jobs | Are you a 2023 college graduate or rising coll...
4 | 4 | Data Scientist | NORC at the University of Chicago | SimplyHired | JOB DESCRIPTION: At NORC, Data Scientists pla...
df.describe(include='all')
  | id | title | company | announcement | description
---|---|---|---|---|---
count | 790.000000 | 790 | 790 | 790 | 790
unique | NaN | 515 | 665 | 203 | 790
top | NaN | Data Scientist | Upwork | LinkedIn | TransUnion's Job Applicant Privacy Notice Wha...
freq | NaN | 80 | 13 | 189 | 1
mean | 394.500000 | NaN | NaN | NaN | NaN
std | 228.197648 | NaN | NaN | NaN | NaN
min | 0.000000 | NaN | NaN | NaN | NaN
25% | 197.250000 | NaN | NaN | NaN | NaN
50% | 394.500000 | NaN | NaN | NaN | NaN
75% | 591.750000 | NaN | NaN | NaN | NaN
max | 789.000000 | NaN | NaN | NaN | NaN
df.isna().sum()
id              0
title           0
company         0
announcement    0
description     0
dtype: int64
df['announcement'].value_counts()[:20]
LinkedIn                      189
SimplyHired                    79
ZipRecruiter                   66
Salary.com                     41
Startup Jobs                   20
Adzuna                         20
Glassdoor                      20
Greenhouse                     14
Upwork                         13
Built In                       13
Clearance Jobs                 10
SmartRecruiters Job Search      9
AngelList                       7
The Muse                        7
Lever                           6
Mendeley                        6
WayUp                           5
Monster                         5
Nexxt                           5
Dice                            5
Name: announcement, dtype: int64
# Extract job announcements
job_announcements = df['announcement'].value_counts().rename_axis('announcement').reset_index(name='counts')
source = ColumnDataSource(data=job_announcements)
# Create bar chart
p = figure(height=350, width=600, title=f'Distribution of job announcement platforms')
# p.vbar(x='words', top='frequency', width=0.9, color='color', legend_field='words', source=source)
p.line(x='index', y='counts', line_color='red', source=source, line_width=2)
# p.vbar(x='index', top='counts', line_color='navy', source=source, width=0.9)
p.xaxis.major_label_orientation = 3.14/4 # rotate labels by 45 degrees
ticks = [0, 20, 40, 100, 150, 200]
p.xaxis.ticker = ticks
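# Replace the numeric tick labels with the platform names at those index positions
# (applied via major_label_overrides below), so the x-axis shows a readable subset of platforms.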
xlabel_ticks = {}
for t in ticks:
    xlabel_ticks[t] = job_announcements['announcement'][t]
p.xaxis.major_label_overrides = xlabel_ticks
p.xaxis.axis_label = 'Platforms'
p.yaxis.axis_label = 'Counts'
p.xgrid.grid_line_color = None
# Output visualization to HTML file
# output_file('word_frequency.html')
show(p)
df['company'].value_counts()
Upwork                                     13
Walmart                                    12
Dice                                       10
Booz Allen Hamilton                        10
Cardinal Health                             6
                                           ..
InfiCare Software Technologies Pvt Ltd.     1
Faire                                       1
Apex Systems                                1
Bluestem Brands                             1
Hicuity Health                              1
Name: company, Length: 665, dtype: int64
# Extract companies
companies = df['company'].value_counts().rename_axis('company').reset_index(name='counts')
source = ColumnDataSource(data=companies)
# Create bar chart
p = figure(height=350, width=600, title=f'Distribution of hiring companies')
# p.vbar(x='words', top='frequency', width=0.9, color='color', legend_field='words', source=source)
p.line(x='index', y='counts', line_color='red', source=source, line_width=2)
# p.vbar(x='index', top='counts', line_color='navy', source=source, width=0.9)
p.xaxis.major_label_orientation = 3.14/4 # rotate labels by 45 degrees
ticks = [0, 20, 40, 100, 150, 200, len(companies) - 1]
p.xaxis.ticker = ticks
xlabel_ticks = {}
for t in ticks:
    xlabel_ticks[t] = companies['company'][t]
p.xaxis.major_label_overrides = xlabel_ticks
p.xaxis.axis_label = 'Companies'
p.yaxis.axis_label = 'Counts'
p.xgrid.grid_line_color = None
# Output visualization to HTML file
# output_file('word_frequency.html')
show(p)
# Define a function to clean job titles
def clean_job_title(job_title):
    job_title = job_title.lower()  # convert to lowercase
    job_title = re.sub(r'[^a-zA-Z0-9\n\.]', ' ', job_title)  # replace special characters with spaces (periods are kept)
    job_title = re.sub(r'\s+', ' ', job_title)  # collapse repeated whitespace
    job_title = job_title.strip()  # remove leading/trailing whitespace
    return job_title
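# Illustrative example of the cleaning (made-up title, not from the dataset):
print(clean_job_title('Sr. Data Scientist -- NLP/ML (Remote)'))  # -> 'sr. data scientist nlp ml remote'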
# Apply the clean_job_title function to the job title column
df['job_title'] = df['title'].apply(clean_job_title)
# Get the value counts of each job title
job_title_counts = df['job_title'].value_counts()
# Print the top 10 most common job titles
print(job_title_counts)
data scientist                               81
data analyst                                 71
senior data scientist                        29
senior data analyst                          22
business data analyst                        13
                                             ..
data analyst monitoring                       1
vp data science                               1
senior data scientist game security           1
senior data scientist remote phd required     1
cost controller data analyst                  1
Name: job_title, Length: 507, dtype: int64
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk import ngrams
# Extract job titles
job_titles = df['title'].tolist()
# Preprocessing
stop_words = set(stopwords.words('english'))
tokens = []
processed_titles = []
for title in job_titles:
    # Convert to lowercase
    title = title.lower()
    # Remove punctuation and special characters
    title = re.sub(r'[^\w\s]', '', title)
    # Tokenize title
    title_tokens = word_tokenize(title)
    # Remove stop words
    title_tokens = [token for token in title_tokens if token not in stop_words]
    tokens.extend(title_tokens)
    processed_titles.append(title_tokens)
# Calculate word frequency
fdist = FreqDist(tokens)
top_n = 20
top_words = dict(fdist.most_common(top_n))
# Prepare data for visualization
data = {'words': list(top_words.keys()), 'frequency': list(top_words.values())}
source = ColumnDataSource(data=data)
# Create bar chart
p = figure(x_range=data['words'], height=350, title=f'Top {top_n} Most Common Words in Job Titles')
# p.vbar(x='words', top='frequency', width=0.9, color='color', legend_field='words', source=source)
p.line(x='words', y='frequency', line_color='red', source=source, line_width=2)
# p.vbar(x='words', top='frequency', line_color='navy', source=source, width=0.9)
p.xaxis.major_label_orientation = 3.14/4 # rotate labels by 45 degrees
# p.circle(x='words', y='frequency', color='color', legend_field='words', source=source, size=8)
p.xgrid.grid_line_color = None
# p.legend.orientation = 'horizontal'
# p.legend.location = 'top_center'
# Output visualization to HTML file
# output_file('word_frequency.html')
show(p)
from wordcloud import WordCloud
# Generate word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(fdist)
# Visualize word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
# wordcloud.to_file('top20-wordcloud.png')
# Calculate bigrams and trigrams
bigrams = []
trigrams = []
for title in processed_titles:
    bigrams.extend(list(ngrams(title, 2)))
    trigrams.extend(list(ngrams(title, 3)))
# Calculate frequency distribution of bigrams and trigrams
bigram_fdist = FreqDist(bigrams)
trigram_fdist = FreqDist(trigrams)
# Print the top 10 most common bigrams
print('Top 10 most common bigrams:')
for bigram, frequency in bigram_fdist.most_common(10):
    print(bigram, frequency)
# Print the top 10 most common trigrams
print('\nTop 10 most common trigrams:')
for trigram, frequency in trigram_fdist.most_common(10):
    print(trigram, frequency)
Top 10 most common bigrams:
('data', 'analyst') 341
('data', 'scientist') 277
('senior', 'data') 108
('data', 'science') 43
('data', 'analytics') 35
('sr', 'data') 27
('business', 'data') 25
('data', 'specialist') 23
('lead', 'data') 20
('data', 'analysis') 16

Top 10 most common trigrams:
('senior', 'data', 'scientist') 58
('senior', 'data', 'analyst') 42
('business', 'data', 'analyst') 20
('sr', 'data', 'scientist') 16
('sr', 'data', 'analyst') 11
('staff', 'data', 'scientist') 9
('lead', 'data', 'scientist') 9
('lead', 'data', 'analyst') 9
('entry', 'level', 'data') 8
('level', 'data', 'analyst') 8
bigrams_freqs = {}
for gram, freq in bigram_fdist.items():
    bigrams_freqs[' '.join(gram)] = freq
from wordcloud import WordCloud
# Generate word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(bigrams_freqs)
# Visualize word cloud
plt.figure(figsize=(8, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
# wordcloud.to_file('bigrams-wordcloud.png')
trigrams_freqs = {}
for gram, freq in trigram_fdist.items():
    trigrams_freqs[' '.join(gram)] = freq
from wordcloud import WordCloud
# Generate word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(trigrams_freqs)
# Visualize word cloud
plt.figure(figsize=(8, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
# wordcloud.to_file('trigrams-wordcloud.png')
from bokeh.palettes import Category20
from bokeh.transform import linear_cmap, factor_cmap
from bokeh.palettes import Blues
# Extract the top 10 most common bigrams and trigrams
top_bigrams = bigram_fdist.most_common(10)
top_trigrams = trigram_fdist.most_common(10)
# Create a list of the bigram and trigram labels
bigram_labels = [', '.join(bigram) for bigram, _ in top_bigrams]
trigram_labels = [', '.join(trigram) for trigram, _ in top_trigrams]
# Create a list of the bigram and trigram frequencies
bigram_frequencies = [frequency for _, frequency in top_bigrams]
trigram_frequencies = [frequency for _, frequency in top_trigrams]
# Create a ColumnDataSource for the bigrams and trigrams
bigram_source = ColumnDataSource(data=dict(labels=bigram_labels, frequencies=bigram_frequencies))
trigram_source = ColumnDataSource(data=dict(labels=trigram_labels, frequencies=trigram_frequencies))
# Define a color map based on the height of the bars
# color_mapper = linear_cmap(field_name='frequencies', palette=Blues[9], low=-100, high=max(bigram_frequencies))
color_mapper = linear_cmap(field_name='frequencies', palette=Blues[9], low=min(bigram_frequencies), high=max(bigram_frequencies))
# Create a figure for the bigrams
bigram_plot = figure(y_range=bigram_labels, height=400, width=600,
title='Top 10 most common bigrams in job titles')
bigram_plot.hbar(y='labels', right='frequencies', height=0.8, source=bigram_source,
                 fill_color=color_mapper, line_color='white')
bigram_plot.x_range.start = 0
bigram_plot.ygrid.grid_line_color = None
# Create a figure for the trigrams
trigram_plot = figure(y_range=trigram_labels, height=400, width=600,
title='Top 10 most common trigrams in job titles')
trigram_plot.hbar(y='labels', right='frequencies', height=0.8, source=trigram_source,
line_color='white')
trigram_plot.x_range.start = 0
trigram_plot.ygrid.grid_line_color = None
# Show the plots
show(bigram_plot)
show(trigram_plot)
# Define a function to identify seniority level based on keywords in the job title
def get_seniority_level(title):
    senior_keywords = ['senior', 'lead', 'principal', 'vp', 'director', 'staff', 'manager']
    for keyword in senior_keywords:
        if keyword in title.lower():
            return 'Senior'
    return 'Other'
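# Illustrative checks (made-up titles):
print(get_seniority_level('Lead Data Scientist'))        # -> 'Senior'
print(get_seniority_level('Data Analyst, Entry Level'))  # -> 'Other'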
# Apply the get_seniority_level function to the job title column and create a new seniority_level column
df['seniority_level'] = df['title'].apply(get_seniority_level)
# Group the data by seniority level and count the number of job titles in each group
grouped_df = df.groupby('seniority_level')['title'].count().reset_index(name='count')
# Create a Bokeh ColumnDataSource object for the bar chart
source = ColumnDataSource(grouped_df)
# Create the bar chart with Bokeh
p = figure(x_range=grouped_df['seniority_level'], height=300, width=400,
title='Distribution of Job Titles by Seniority Level')
p.vbar(x='seniority_level', top='count', width=0.9, source=source)
p.xgrid.grid_line_color = None
p.y_range.start = 0
p.xaxis.axis_label = 'Seniority Level'
p.yaxis.axis_label = 'Number of Job Titles'
show(p)
Years of experience from job descriptions
# Apply the clean_job_title function to the description column
df['description'] = df['description'].apply(clean_job_title)
# Define the regex pattern to extract years of experience from the job description
exp_regex = r'(\d+)\+? year[s]? ?(?:of )?experience'
# Extract years of experience from the job description using the regex pattern
df['years_of_experience'] = df['description'].str.extract(exp_regex, expand=False)
df['years_of_experience'] = df['years_of_experience'].fillna(0).astype(int)
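# Quick sanity check of the pattern on a made-up snippet (illustrative only):
sample = 'we require 5 years of experience with python and sql'
match = re.search(exp_regex, sample)
print(match.group(1) if match else None)  # -> '5'; descriptions with no match end up as 0 above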
# Create a Bokeh ColumnDataSource object for the scatter plot
source = ColumnDataSource(df[['id', 'years_of_experience']])
# Create the bar chart with Bokeh
p = figure( height=300, width=600,
title='Distribution of titles by years of experience')
p.circle(x='id', y='years_of_experience', size=5, source=source)
show(p)
from bokeh.models import HoverTool
# Calculate length of job titles
df['title_length'] = df['title'].apply(lambda x: len(x.split()))
# Create a histogram of title lengths
title_lengths = df['title_length'].tolist()
hist, edges = np.histogram(df['title_length'])
# Shift to center the tick labels
edges = edges - (edges[1] - edges[0]) / 2
# Create the plot
p = figure(title='Title Length Distribution',
x_axis_label='Number of Words in Title',
y_axis_label='Frequency')
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color='white')
p.y_range.start = 0
# Show the plot
show(p)
import nltk
from nltk.stem import WordNetLemmatizer
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess(text):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words and token.isalpha()]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens
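# Illustrative example (made-up title): tokens are lowercased, stop words and
# non-alphabetic tokens dropped, and plural nouns lemmatized to their singular form.
print(preprocess('Senior Data Scientists and Machine Learning Engineers'))
# -> ['senior', 'data', 'scientist', 'machine', 'learning', 'engineer']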
# Preprocess the job titles
df['title_tokens'] = df['title'].apply(preprocess)
# Create a dictionary from the job titles
dictionary = Dictionary(df['title_tokens'])
# Create a corpus from the dictionary and job titles
corpus = [dictionary.doc2bow(title_tokens) for title_tokens in df['title_tokens']]
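# Each entry in the corpus is a bag-of-words: a list of (token_id, count) pairs, e.g.
# dictionary.doc2bow(['data', 'scientist', 'data']) gives [(id_of_'data', 2), (id_of_'scientist', 1)].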
[nltk_data] Downloading package stopwords to /home/mehdi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mehdi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
# Train the LDA model
topics_range = [3,4,5,6,10]
n_topics = 0
cohs = []
for num_topics in topics_range:
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=10, iterations=400)
    cm = CoherenceModel(model=lda_model, texts=df['title_tokens'].values.tolist(), coherence='c_v')
    # cm = CoherenceModel(model=lda_model, corpus=corpus, coherence='u_mass')
    coherence = cm.get_coherence()
    cohs.append(coherence)
    # print(f"Number of topics: {num_topics}, Coherence: {coherence}")
print(f"Coherence: {cohs}")
n_topics = topics_range[np.argmax(cohs)]
# print(n_topics)
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=n_topics, passes=10, iterations=500)
Coherence: [0.5183223261652622, 0.5284432948010035, 0.5038624806742165, 0.5269961172403413, 0.5121491219485661]
# Print the top 10 words for each topic
for topic_id, topic_words in lda_model.show_topics(num_topics=n_topics, num_words=10, formatted=False):
    print(f"Topic {topic_id}: {[word[0] for word in topic_words]}")
Topic 0: ['operation', 'insight', 'engineer', 'analysis', 'analytics', 'real', 'specialist', 'estate', 'assurance', 'big']
Topic 1: ['data', 'analyst', 'science', 'analytics', 'remote', 'lead', 'intern', 'manager', 'marketing', 'director']
Topic 2: ['data', 'scientist', 'senior', 'remote', 'analytics', 'analyst', 'staff', 'specialist', 'ii', 'product']
Topic 3: ['analyst', 'data', 'business', 'analysis', 'specialist', 'level', 'reporting', 'ii', 'entry', 'junior']
# Get the topic distribution for each job title
df['topic_distribution'] = df['title_tokens'].apply(lambda x: lda_model[dictionary.doc2bow(x)])
df['topic'] = df['topic_distribution'].apply(lambda x: max(x, key=lambda tp: tp[1])[0])  # topic id with the highest probability
# Group job titles by topic and count the number of titles in each group
topic_counts = df.groupby('topic')['title'].count()
# topic_counts
from bokeh.models import FactorRange
from bokeh.palettes import Spectral5
from bokeh.layouts import column
# Create a Bokeh data source with the topic counts
source = ColumnDataSource(data={
'topics': [str(t) for t in topic_counts.index],
'counts': topic_counts.values,
})
# Define the x-axis and y-axis ranges
x_range = FactorRange(factors=source.data['topics'])
y_range = (0, max(source.data['counts']) * 1.1)
# Create a figure object
p = figure(x_range=x_range, y_range=y_range, height=400, width=800, title='Distribution of Job Titles by Topic')
# Add a vertical bar chart to the figure
p.vbar(x='topics', top='counts', width=0.9, source=source, line_color='white', fill_color=Spectral5[0])
# Set visual properties for the figure
# p.xaxis.major_label_orientation = 1
p.xgrid.grid_line_color = None
p.xaxis.axis_label = 'Topics'
p.yaxis.axis_label = 'Number of Job Titles'
p.yaxis.minor_tick_line_color = None
p.yaxis.major_tick_line_color = None
# p.title.text_font_size = '16pt'
# Create an HTML file for the output and show the figure
# output_file('job_title_distribution.html')
show(p)
# topic_term_matrix = lda_model.get_topics()
# lda_model.get_topic_terms(topicid=0, topn=20)
# lda_model.get_document_topics(corpus[0])
# dictionary.id2token[0]
# topic_term_matrix.shape
# create a heatmap of the topic distributions for each job title
import seaborn as sns
import numpy as np
topic_distributions = np.zeros((len(df), n_topics))
for i, doc in enumerate(corpus):
    for topic, prob in lda_model.get_document_topics(doc):
        topic_distributions[i][topic] = prob
plt.figure(figsize=(5, 10))
heatmap_plot = sns.heatmap(topic_distributions, cmap='Blues', cbar=True)
heatmap_fig = heatmap_plot.get_figure()
plt.xlabel('Topic')
plt.ylabel('Job Title Id')
plt.title('Topic Distributions for Job Titles')
plt.show()
heatmap_fig.savefig('ff.png')
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk import ngrams
import string
import re
# Extract job titles
job_descriptions = df['description'].tolist()
# Preprocessing
stop_words = set(stopwords.words('english'))
tokens = []
processed_description = []
for description in job_descriptions:
    # Convert to lowercase
    description = description.lower()
    # Remove punctuation and special characters
    description = re.sub(r'[^\w\s]', '', description)
    # Tokenize the description
    description_tokens = word_tokenize(description)
    # Remove stop words
    description_tokens = [token for token in description_tokens if token not in stop_words]
    tokens.extend(description_tokens)
    processed_description.append(description_tokens)
# Calculate word frequency
fdist = FreqDist(tokens)
top_n = 20
top_words = dict(fdist.most_common(top_n))
# Prepare data for visualization
data = {'words': list(top_words.keys()), 'frequency': list(top_words.values())}
source = ColumnDataSource(data=data)
# Create bar chart
p = figure(x_range=data['words'], height=350, title=f'Top {top_n} Most Common Words in Job descriptions', toolbar_location=None, tools="")
# p.vbar(x='words', top='frequency', width=0.9, color='color', legend_field='words', source=source)
p.line(x='words', y='frequency', line_color='navy', source=source, line_width=2)
p.xaxis.major_label_orientation = 3.14/4 # rotate labels by 45 degrees
# p.circle(x='words', y='frequency', color='color', legend_field='words', source=source, size=8)
p.xgrid.grid_line_color = None
# p.legend.orientation = 'horizontal'
# p.legend.location = 'top_center'
# Output visualization to HTML file
# output_file('word_frequency.html')
show(p)
from wordcloud import WordCloud
# Generate word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(fdist)
# Visualize word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
# Calculate bigrams and trigrams
bigrams = []
trigrams = []
for description in processed_description:
    bigrams.extend(list(ngrams(description, 2)))
    trigrams.extend(list(ngrams(description, 3)))
# Calculate frequency distribution of bigrams and trigrams
bigram_fdist = FreqDist(bigrams)
trigram_fdist = FreqDist(trigrams)
# Print the top 10 most common bigrams
print('Top 10 most common bigrams:')
for bigram, frequency in bigram_fdist.most_common(10):
    print(bigram, frequency)
# Print the top 10 most common trigrams
print('Top 10 most common trigrams:')
for trigram, frequency in trigram_fdist.most_common(10):
    print(trigram, frequency)
Top 10 most common bigrams:
('data', 'science') 968
('machine', 'learning') 701
('data', 'analysis') 607
('data', 'analyst') 546
('years', 'experience') 537
('data', 'scientist') 459
('data', 'analytics') 403
('computer', 'science') 370
('equal', 'opportunity') 368
('bachelor', 'degree') 352
Top 10 most common trigrams:
('equal', 'opportunity', 'employer') 272
('sexual', 'orientation', 'gender') 248
('orientation', 'gender', 'identity') 228
('race', 'color', 'religion') 177
('without', 'regard', 'race') 161
('regard', 'race', 'color') 132
('gender', 'identity', 'expression') 125
('employment', 'without', 'regard') 117
('2', 'years', 'experience') 116
('5', 'years', 'experience') 108
from sklearn.cluster import KMeans
from bertopic import BERTopic
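# Note: swapping BERTopic's default HDBSCAN for KMeans forces every title into one of n_topics clusters
# (KMeans has no outlier cluster), which keeps the cluster count comparable to the LDA model above.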
cluster_model = KMeans(n_clusters=n_topics)
# topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2")
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2", hdbscan_model=cluster_model)
docs = df['title'].values.tolist()
topics, probs = topic_model.fit_transform(docs)
topic_model.get_topic_info()
  | Topic | Count | Name
---|---|---|---
0 | 0 | 383 | 0_data_analyst_senior_remote
1 | 1 | 176 | 1_analyst_data_level_entry
2 | 2 | 164 | 2_scientist_data_science_analytics
3 | 3 | 67 | 3_senior_scientist_data_specialist
topic_model.get_document_info(docs)
  | Document | Topic | Name | Top_n_words | Representative_document
---|---|---|---|---|---
0 | Senior Analyst, Data Science and Analytics | 0 | 0_data_analyst_senior_remote | data - analyst - senior - remote - analytics -... | False
1 | Senior Data Scientist | 3 | 3_senior_scientist_data_specialist | senior - scientist - data - specialist - and -... | True
2 | Lead Data Science Analyst | 0 | 0_data_analyst_senior_remote | data - analyst - senior - remote - analytics -... | False
3 | Data Science Intern | 0 | 0_data_analyst_senior_remote | data - analyst - senior - remote - analytics -... | False
4 | Data Scientist | 2 | 2_scientist_data_science_analytics | scientist - data - science - analytics - jobs ... | True
... | ... | ... | ... | ... | ...
785 | Research and Data Specialist | 0 | 0_data_analyst_senior_remote | data - analyst - senior - remote - analytics -... | False
786 | Quality Assurance Data Specialist | 0 | 0_data_analyst_senior_remote | data - analyst - senior - remote - analytics -... | False
787 | Senior Data Analyst | 0 | 0_data_analyst_senior_remote | data - analyst - senior - remote - analytics -... | True
788 | Cost Controller/Data Analyst | 1 | 1_analyst_data_level_entry | analyst - data - level - entry - hybrid - visu... | False
789 | Data Specialist | 0 | 0_data_analyst_senior_remote | data - analyst - senior - remote - analytics -... | False
790 rows × 5 columns
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
# sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
# embeddings = sentence_model.encode(docs, show_progress_bar=False)
# # Train BERTopic
# topic_model = BERTopic().fit(docs, embeddings)
# # Run the visualization with the original embeddings
# topic_model.visualize_documents(docs, embeddings=embeddings)
# # Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
# reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
# topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
topic_model.visualize_hierarchy()
topic_model.visualize_heatmap()
# Calculate the topic distributions on a token-level
topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)
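# topic_token_distr[doc_id] holds, for each token of that document, its weight toward each topic;
# visualize_approximate_distribution below renders it with topics as rows and tokens as columns.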
# Visualize the token-level distributions
doc_id = 0
print(docs[doc_id])
df_dist = topic_model.visualize_approximate_distribution(docs[doc_id], topic_token_distr[doc_id])
df_dist
Senior Analyst, Data Science and Analytics
  | Senior | Analyst | Data | Science | and | Analytics
---|---|---|---|---|---|---
0_data_analyst_senior_remote | 0.459 | 0.818 | 1.156 | 1.156 | 0.697 | 0.338
1_analyst_data_level_entry | 0.353 | 0.678 | 0.819 | 0.819 | 0.466 | 0.140
2_scientist_data_science_analytics | 0.194 | 0.408 | 0.657 | 0.657 | 0.463 | 0.250
3_senior_scientist_data_specialist | 0.439 | 0.556 | 0.556 | 0.556 | 0.118 | 0.000
Exercise: Find job title variations. Below is the starting code.
unique_titles = df['job_title'].unique()
len(unique_titles)
507
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(unique_titles, show_progress_bar=True)
from sentence_transformers import util
#Compute cosine-similarities for each sentence with each other sentence
cosine_scores = util.cos_sim(embeddings, embeddings)
#Find the pairs with the highest cosine similarity scores
pairs = []
for i in range(len(cosine_scores)-1):
    for j in range(i+1, len(cosine_scores)):
        pairs.append({'index': [i, j], 'score': cosine_scores[i][j]})
# Sort scores in decreasing order
pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)
for pair in pairs[0:10]:
    i, j = pair['index']
    print("{} \t\t {} \t\t Score: {:.4f}".format(unique_titles[i], unique_titles[j], pair['score']))
hybrid data analyst data analyst hybrid Score: 0.9779
junior data analyst data analyst junior Score: 0.9774
data scientist analytics data scientist data analytics Score: 0.9766
senior data analyst data analyst senior Score: 0.9746
senior data analyst transportation optimization 3 senior data analyst transportation optimization Score: 0.9737
senior data scientist remote senior data scientist us remote Score: 0.9731
senior data scientist data scientist senior Score: 0.9727
business data analyst data analyst business analyst Score: 0.9688
data analyst senior data analyst technical senior Score: 0.9649
senior technical data analyst data analyst technical senior Score: 0.9639
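As an aside, sentence-transformers also ships util.paraphrase_mining, which performs the same most-similar-pairs search without building the full pair list in Python. A minimal sketch, reusing sentence_model and unique_titles from above:
# Alternative (sketch): let sentence-transformers mine the most similar title pairs directly
mined_pairs = util.paraphrase_mining(sentence_model, list(unique_titles), top_k=10)
for score, i, j in mined_pairs[:10]:
    print(f"{unique_titles[i]} \t\t {unique_titles[j]} \t\t Score: {score:.4f}")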
for pair in pairs:
    i, j = pair['index']
    if pair['score'] >= 0.87:
        unique_titles[j] = unique_titles[i]
        # print("{} \t\t {} \t\t Score: {:.4f}".format(unique_titles[i], unique_titles[j], pair['score']))
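One way to finish the exercise (a sketch; the canonical_title column name is just for illustration): map each cleaned title onto the merged form chosen by the 0.87 threshold above and count how many distinct variations remain.
# Map each original cleaned title to its merged (canonical) form and count the remaining groups
canonical = dict(zip(df['job_title'].unique(), unique_titles))
df['canonical_title'] = df['job_title'].map(canonical)
print(df['canonical_title'].nunique())  # number of title groups left after merging near-duplicates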