In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource
from bokeh.palettes import Spectral6, Category10
from bokeh.io import output_notebook, reset_output

import string
import re

output_notebook()
Loading BokehJS ...
In [34]:
df = pd.read_csv('ds_jobs.csv')
In [35]:
df.head()
Out[35]:
id title company announcement description
0 0 Senior Analyst, Data Science and Analytics TransUnion The Muse TransUnion's Job Applicant Privacy Notice Wha...
1 1 Senior Data Scientist Grubhub Holdings, Inc. ZipRecruiter About The Opportunity We're all about connect...
2 2 Lead Data Science Analyst Discover Financial Services LinkedIn Discover. A brighter future. With us, you’ll ...
3 3 Data Science Intern AbelsonTaylor Startup Jobs Are you a 2023 college graduate or rising coll...
4 4 Data Scientist NORC at the University of Chicago SimplyHired JOB DESCRIPTION: At NORC, Data Scientists pla...
In [36]:
df.describe(include='all')
Out[36]:
id title company announcement description
count 790.000000 790 790 790 790
unique NaN 515 665 203 790
top NaN Data Scientist Upwork LinkedIn TransUnion's Job Applicant Privacy Notice Wha...
freq NaN 80 13 189 1
mean 394.500000 NaN NaN NaN NaN
std 228.197648 NaN NaN NaN NaN
min 0.000000 NaN NaN NaN NaN
25% 197.250000 NaN NaN NaN NaN
50% 394.500000 NaN NaN NaN NaN
75% 591.750000 NaN NaN NaN NaN
max 789.000000 NaN NaN NaN NaN
In [37]:
df.isna().sum()
Out[37]:
id              0
title           0
company         0
announcement    0
description     0
dtype: int64
In [38]:
df['announcement'].value_counts()[:20]
Out[38]:
LinkedIn                      189
SimplyHired                    79
ZipRecruiter                   66
Salary.com                     41
Startup Jobs                   20
Adzuna                         20
Glassdoor                      20
Greenhouse                     14
Upwork                         13
Built In                       13
Clearance Jobs                 10
SmartRecruiters Job Search      9
AngelList                       7
The Muse                        7
Lever                           6
Mendeley                        6
WayUp                           5
Monster                         5
Nexxt                           5
Dice                            5
Name: announcement, dtype: int64
In [39]:
# Extract job announcements
job_announcements = df['announcement'].value_counts().rename_axis('announcement').reset_index(name='counts')

source = ColumnDataSource(data=job_announcements)

# Create a rank-frequency line chart of the platforms
p = figure(height=350, width=600, title='Distribution of job announcement platforms')
p.line(x='index', y='counts', line_color='red', source=source, line_width=2)

p.xaxis.major_label_orientation = np.pi/4  # rotate labels by 45 degrees
# Label a few rank positions with the corresponding platform names
ticks = [0, 20, 40, 100, 150, 200]
p.xaxis.ticker = ticks
p.xaxis.major_label_overrides = {t: job_announcements['announcement'][t] for t in ticks}
p.xaxis.axis_label = 'Platforms'
p.yaxis.axis_label = 'Counts'
p.xgrid.grid_line_color = None

show(p)
In [40]:
df['company'].value_counts()
Out[40]:
Upwork                                     13
Walmart                                    12
Dice                                       10
Booz Allen Hamilton                        10
Cardinal Health                             6
                                           ..
InfiCare Software Technologies Pvt Ltd.     1
Faire                                       1
Apex Systems                                1
Bluestem Brands                             1
Hicuity Health                              1
Name: company, Length: 665, dtype: int64
In [41]:
# Extract companies
companies = df['company'].value_counts().rename_axis('company').reset_index(name='counts')

source = ColumnDataSource(data=companies)

# Create a rank-frequency line chart of the companies
p = figure(height=350, width=600, title='Distribution of hiring companies')
p.line(x='index', y='counts', line_color='red', source=source, line_width=2)

p.xaxis.major_label_orientation = np.pi/4  # rotate labels by 45 degrees
# Label a few rank positions with the corresponding company names
ticks = [0, 20, 40, 100, 150, 200, len(companies) - 1]
p.xaxis.ticker = ticks
p.xaxis.major_label_overrides = {t: companies['company'][t] for t in ticks}
p.xaxis.axis_label = 'Companies'
p.yaxis.axis_label = 'Counts'
p.xgrid.grid_line_color = None

show(p)
In [42]:
# Define a function to clean job titles
def clean_job_title(job_title):
    job_title = job_title.lower()  # convert to lowercase
    job_title = re.sub(r'[^a-zA-Z0-9\n\.]', ' ', job_title)  # remove special characters
    job_title = re.sub(r'\s+', ' ', job_title)  # collapse extra whitespace
    job_title = job_title.strip()  # remove leading/trailing whitespace
    return job_title


# Apply the clean_job_title function to the job title column
df['job_title'] = df['title'].apply(clean_job_title)

# Get the value counts of each job title
job_title_counts = df['job_title'].value_counts()

# Print the job title counts
print(job_title_counts)
data scientist                               81
data analyst                                 71
senior data scientist                        29
senior data analyst                          22
business data analyst                        13
                                             ..
data analyst monitoring                       1
vp data science                               1
senior data scientist game security           1
senior data scientist remote phd required     1
cost controller data analyst                  1
Name: job_title, Length: 507, dtype: int64
In [43]:
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk import ngrams



# Extract job titles
job_titles = df['title'].tolist()

# Preprocessing
stop_words = set(stopwords.words('english'))
tokens = []
processed_titles = []
for title in job_titles:
    # Convert to lowercase
    title = title.lower()
    # Remove punctuation and special characters
    title = re.sub(r'[^\w\s]', '', title)
    # Tokenize title
    title_tokens = word_tokenize(title)
    # Remove stop words
    title_tokens = [token for token in title_tokens if token not in stop_words]
    tokens.extend(title_tokens)
    processed_titles.append(title_tokens)


# Calculate word frequency
fdist = FreqDist(tokens)
top_n = 20
top_words = dict(fdist.most_common(top_n))

# Prepare data for visualization
data = {'words': list(top_words.keys()), 'frequency': list(top_words.values())}
source = ColumnDataSource(data=data)

# Create a frequency line chart of the top title words
p = figure(x_range=data['words'], height=350, title=f'Top {top_n} Most Common Words in Job Titles')
p.line(x='words', y='frequency', line_color='red', source=source, line_width=2)

p.xaxis.major_label_orientation = np.pi/4  # rotate labels by 45 degrees
p.xgrid.grid_line_color = None

show(p)
In [44]:
from wordcloud import WordCloud

# Generate word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(fdist)

# Visualize word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
# wordcloud.to_file('top20-wordcloud.png')

N-gram Analysis: Analyze the n-grams in job titles to identify patterns or trends.¶

In [45]:
# Calculate bigrams and trigrams
bigrams = []
trigrams = []
for title in processed_titles:
    bigrams.extend(list(ngrams(title, 2)))
    trigrams.extend(list(ngrams(title, 3)))

# Calculate frequency distribution of bigrams and trigrams
bigram_fdist = FreqDist(bigrams)
trigram_fdist = FreqDist(trigrams)

# Print the top 10 most common bigrams
print('Top 10 most common bigrams:')
for bigram, frequency in bigram_fdist.most_common(10):
    print(bigram, frequency)

# Print the top 10 most common trigrams
print('\nTop 10 most common trigrams:')
for trigram, frequency in trigram_fdist.most_common(10):
    print(trigram, frequency)
Top 10 most common bigrams:
('data', 'analyst') 341
('data', 'scientist') 277
('senior', 'data') 108
('data', 'science') 43
('data', 'analytics') 35
('sr', 'data') 27
('business', 'data') 25
('data', 'specialist') 23
('lead', 'data') 20
('data', 'analysis') 16

Top 10 most common trigrams:
('senior', 'data', 'scientist') 58
('senior', 'data', 'analyst') 42
('business', 'data', 'analyst') 20
('sr', 'data', 'scientist') 16
('sr', 'data', 'analyst') 11
('staff', 'data', 'scientist') 9
('lead', 'data', 'scientist') 9
('lead', 'data', 'analyst') 9
('entry', 'level', 'data') 8
('level', 'data', 'analyst') 8
In [46]:
# Join each bigram tuple into a single string for the word cloud
bigrams_freqs = {' '.join(bigram): freq for bigram, freq in bigram_fdist.items()}


# Generate word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(bigrams_freqs)

# Visualize word cloud
plt.figure(figsize=(8, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# wordcloud.to_file('bigrams-wordcloud.png')
In [47]:
# Join each trigram tuple into a single string for the word cloud
trigrams_freqs = {' '.join(trigram): freq for trigram, freq in trigram_fdist.items()}


# Generate word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(trigrams_freqs)

# Visualize word cloud
plt.figure(figsize=(8, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# wordcloud.to_file('trigrams-wordcloud.png')
In [48]:
from bokeh.transform import linear_cmap
from bokeh.palettes import Blues

# Extract the top 10 most common bigrams and trigrams
top_bigrams = bigram_fdist.most_common(10)
top_trigrams = trigram_fdist.most_common(10)

# Create a list of the bigram and trigram labels
bigram_labels = [', '.join(bigram) for bigram, _ in top_bigrams]
trigram_labels = [', '.join(trigram) for trigram, _ in top_trigrams]

# Create a list of the bigram and trigram frequencies
bigram_frequencies = [frequency for _, frequency in top_bigrams]
trigram_frequencies = [frequency for _, frequency in top_trigrams]

# Create a ColumnDataSource for the bigrams and trigrams
bigram_source = ColumnDataSource(data=dict(labels=bigram_labels, frequencies=bigram_frequencies))
trigram_source = ColumnDataSource(data=dict(labels=trigram_labels, frequencies=trigram_frequencies))

# Define color maps based on the bar heights
bigram_cmap = linear_cmap(field_name='frequencies', palette=Blues[9],
                          low=min(bigram_frequencies), high=max(bigram_frequencies))
trigram_cmap = linear_cmap(field_name='frequencies', palette=Blues[9],
                           low=min(trigram_frequencies), high=max(trigram_frequencies))

# Create a figure for the bigrams
bigram_plot = figure(y_range=bigram_labels, height=400, width=600,
                     title='Top 10 most common bigrams in job titles')
bigram_plot.hbar(y='labels', right='frequencies', height=0.8, source=bigram_source,
                 line_color='white', fill_color=bigram_cmap)
bigram_plot.x_range.start = 0
bigram_plot.ygrid.grid_line_color = None

# Create a figure for the trigrams
trigram_plot = figure(y_range=trigram_labels, height=400, width=600,
                      title='Top 10 most common trigrams in job titles')
trigram_plot.hbar(y='labels', right='frequencies', height=0.8, source=trigram_source,
                  line_color='white', fill_color=trigram_cmap)

trigram_plot.x_range.start = 0
trigram_plot.ygrid.grid_line_color = None

# Show the plots
show(bigram_plot)
show(trigram_plot)

Identifying seniority level¶

In [49]:
# Define a function to identify seniority level based on keywords in the job title
def get_seniority_level(title):
    senior_keywords = ['senior', 'lead', 'principal', 'vp', 'director', 'staff', 'manager']
    for keyword in senior_keywords:
        if keyword in title.lower():
            return 'Senior'
    return 'Other'

# Apply the get_seniority_level function to the job title column and create a new seniority_level column
df['seniority_level'] = df['title'].apply(get_seniority_level)

# Group the data by seniority level and count the number of job titles in each group
grouped_df = df.groupby('seniority_level')['title'].count().reset_index(name='count')

# Create a Bokeh ColumnDataSource object for the bar chart
source = ColumnDataSource(grouped_df)

# Create the bar chart with Bokeh
p = figure(x_range=grouped_df['seniority_level'], height=300, width=400,
           title='Distribution of Job Titles by Seniority Level')

p.vbar(x='seniority_level', top='count', width=0.9, source=source)
p.xgrid.grid_line_color = None
p.y_range.start = 0
p.xaxis.axis_label = 'Seniority Level'
p.yaxis.axis_label = 'Number of Job Titles'

show(p)

Extracting years of experience from job descriptions¶

In [50]:
# Apply the clean_job_title function to the description column
df['description'] = df['description'].apply(clean_job_title)

# Define the regex pattern to extract years of experience from the job description
exp_regex = r'(\d+)\+? year[s]? ?(?:of )?experience'

# Extract the first match as a Series (expand=False), then treat misses as 0
df['years_of_experience'] = df['description'].str.extract(exp_regex, expand=False)
df['years_of_experience'] = df['years_of_experience'].fillna(0).astype(int)
In [51]:
# Create a Bokeh ColumnDataSource object for the scatter plot
source = ColumnDataSource(df[['id', 'years_of_experience']])

# Create the scatter plot with Bokeh
p = figure(height=300, width=600,
           title='Distribution of titles by years of experience')

p.circle(x='id', y='years_of_experience', size=5, source=source)

show(p)

There is an outlier: one title shows 100 years of experience, so our regular expression has clearly extracted the wrong number.¶
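A quick way to confirm and patch this (a minimal sketch; the 40-year cutoff is an assumed sanity threshold, not part of the original analysis):

In [ ]:
# Sketch: inspect rows where the extracted value is implausible
outliers = df[df['years_of_experience'] > 40]
print(outliers[['id', 'title', 'years_of_experience']])

# Show what the regex actually matched in those descriptions
for desc in outliers['description']:
    print(re.findall(exp_regex, desc))

# Reset implausible values to 0, matching the fillna convention above
df.loc[df['years_of_experience'] > 40, 'years_of_experience'] = 0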

Distribution of job title lengths¶

In [52]:
# Calculate length of job titles
df['title_length'] = df['title'].apply(lambda x: len(x.split()))

# Create a histogram of title lengths
title_lengths = df['title_length'].tolist()
hist, edges = np.histogram(df['title_length'])

# Shift bin edges so the bars are centered on integer word counts
edges = edges - (edges[1] - edges[0]) / 2

# Create the plot
p = figure(title='Title Length Distribution',
           x_axis_label='Number of Words in Title',
           y_axis_label='Frequency')

p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color='white')
p.y_range.start = 0

# Show the plot
show(p)

Topic modeling¶

In [53]:
import nltk
from nltk.stem import WordNetLemmatizer
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words and token.isalpha()]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

# Preprocess the job titles
df['title_tokens'] = df['title'].apply(preprocess)

# Create a dictionary from the job titles
dictionary = Dictionary(df['title_tokens'])

# Create a corpus from the dictionary and job titles
corpus = [dictionary.doc2bow(title_tokens) for title_tokens in df['title_tokens']]
[nltk_data] Downloading package stopwords to /home/mehdi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mehdi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
In [54]:
# Train the LDA model
topics_range = [3, 4, 5, 6, 10]
cohs = []
for num_topics in topics_range:
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=10, iterations=400)

    cm = CoherenceModel(model=lda_model, texts=df['title_tokens'].values.tolist(), coherence='c_v')
    # cm = CoherenceModel(model=lda_model, corpus=corpus, coherence='u_mass')

    coherence = cm.get_coherence()
    cohs.append(coherence)
    # print(f"Number of topics: {num_topics}, Coherence: {coherence}")   
In [55]:
print(f"Coherence: {cohs}")
n_topics = topics_range[np.argmax(cohs)]
# print(n_topics)
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=n_topics, passes=10, iterations=500)
Coherence: [0.5183223261652622, 0.5284432948010035, 0.5038624806742165, 0.5269961172403413, 0.5121491219485661]
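A small optional sketch (matplotlib is already imported) makes this selection easier to read by plotting the coherence scores against the candidate topic counts:

In [ ]:
# Optional sketch: coherence score for each candidate number of topics
plt.figure(figsize=(5, 3))
plt.plot(topics_range, cohs, marker='o')
plt.xlabel('Number of topics')
plt.ylabel('Coherence (c_v)')
plt.title('LDA coherence by number of topics')
plt.show()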
In [56]:
# Print the top 10 words for each topic
for topic_id, topic_words in lda_model.show_topics(num_topics=n_topics, num_words=10, formatted=False):
    print(f"Topic {topic_id}: {[word[0] for word in topic_words]}")
Topic 0: ['operation', 'insight', 'engineer', 'analysis', 'analytics', 'real', 'specialist', 'estate', 'assurance', 'big']
Topic 1: ['data', 'analyst', 'science', 'analytics', 'remote', 'lead', 'intern', 'manager', 'marketing', 'director']
Topic 2: ['data', 'scientist', 'senior', 'remote', 'analytics', 'analyst', 'staff', 'specialist', 'ii', 'product']
Topic 3: ['analyst', 'data', 'business', 'analysis', 'specialist', 'level', 'reporting', 'ii', 'entry', 'junior']
In [57]:
# Get the topic distribution for each job title
df['topic_distribution'] = df['title_tokens'].apply(lambda x: lda_model[dictionary.doc2bow(x)])
# Pick the topic id with the highest probability; the distribution is a list of
# (topic_id, prob) pairs, so select by probability rather than by list position
df['topic'] = df['topic_distribution'].apply(lambda dist: max(dist, key=lambda t: t[1])[0])
In [58]:
# Group job titles by topic and count the number of titles in each group
topic_counts = df.groupby('topic')['title'].count()
# topic_counts
In [59]:
from bokeh.models import FactorRange
from bokeh.palettes import Spectral5

# Create a Bokeh data source with the topic counts
source = ColumnDataSource(data={
    'topics': [str(t) for t in topic_counts.index],
    'counts': topic_counts.values,
})

# Define the x-axis and y-axis ranges
x_range = FactorRange(factors=source.data['topics'])
y_range = (0, max(source.data['counts']) * 1.1)

# Create a figure object
p = figure(x_range=x_range, y_range=y_range, height=400, width=800, title='Distribution of Job Titles by Topic')

# Add a vertical bar chart to the figure
p.vbar(x='topics', top='counts', width=0.9, source=source, line_color='white', fill_color=Spectral5[0])

# Set visual properties for the figure
# p.xaxis.major_label_orientation = 1
p.xgrid.grid_line_color = None
p.xaxis.axis_label = 'Topics'
p.yaxis.axis_label = 'Number of Job Titles'
p.yaxis.minor_tick_line_color = None
p.yaxis.major_tick_line_color = None
# p.title.text_font_size = '16pt'

# Create an HTML file for the output and show the figure
# output_file('job_title_distribution.html')
show(p)
In [61]:
# Create a heatmap of the topic distributions for each job title
import seaborn as sns

topic_distributions = np.zeros((len(df), n_topics))
for i, doc in enumerate(corpus):
    for topic, prob in lda_model.get_document_topics(doc):
        topic_distributions[i][topic] = prob

plt.figure(figsize=(5, 10))
heatmap_plot = sns.heatmap(topic_distributions, cmap='Blues', cbar=True)
heatmap_fig = heatmap_plot.get_figure()
plt.xlabel('Topic')
plt.ylabel('Job Title Id')
plt.title('Topic Distributions for Job Titles')
plt.show()
# heatmap_fig.savefig('topic_distributions.png')

Job description analysis¶

In [63]:
# Extract job descriptions
job_descriptions = df['description'].tolist()

# Preprocessing
stop_words = set(stopwords.words('english'))
tokens = []
processed_description = []
for description in job_descriptions:
    # Convert to lowercase
    description = description.lower()
    # Remove punctuation and special characters
    description = re.sub(r'[^\w\s]', '', description)
    # Tokenize the description
    description_tokens = word_tokenize(description)
    # Remove stop words
    description_tokens = [token for token in description_tokens if token not in stop_words]
    tokens.extend(description_tokens)
    processed_description.append(description_tokens)


# Calculate word frequency
fdist = FreqDist(tokens)
top_n = 20
top_words = dict(fdist.most_common(top_n))

# Prepare data for visualization
data = {'words': list(top_words.keys()), 'frequency': list(top_words.values())}
source = ColumnDataSource(data=data)

# Create a frequency line chart of the top description words
p = figure(x_range=data['words'], height=350, title=f'Top {top_n} Most Common Words in Job Descriptions', toolbar_location=None, tools="")
p.line(x='words', y='frequency', line_color='navy', source=source, line_width=2)
p.xaxis.major_label_orientation = np.pi/4  # rotate labels by 45 degrees
p.xgrid.grid_line_color = None

show(p)
In [64]:

# Generate word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(fdist)

# Visualize word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
In [65]:
# Calculate bigrams and trigrams
bigrams = []
trigrams = []
for description in processed_description:
    bigrams.extend(list(ngrams(description, 2)))
    trigrams.extend(list(ngrams(description, 3)))

# Calculate frequency distribution of bigrams and trigrams
bigram_fdist = FreqDist(bigrams)
trigram_fdist = FreqDist(trigrams)

# Print the top 10 most common bigrams
print('Top 10 most common bigrams:')
for bigram, frequency in bigram_fdist.most_common(10):
    print(bigram, frequency)

# Print the top 10 most common trigrams
print('Top 10 most common trigrams:')
for trigram, frequency in trigram_fdist.most_common(10):
    print(trigram, frequency)
Top 10 most common bigrams:
('data', 'science') 968
('machine', 'learning') 701
('data', 'analysis') 607
('data', 'analyst') 546
('years', 'experience') 537
('data', 'scientist') 459
('data', 'analytics') 403
('computer', 'science') 370
('equal', 'opportunity') 368
('bachelor', 'degree') 352
Top 10 most common trigrams:
('equal', 'opportunity', 'employer') 272
('sexual', 'orientation', 'gender') 248
('orientation', 'gender', 'identity') 228
('race', 'color', 'religion') 177
('without', 'regard', 'race') 161
('regard', 'race', 'color') 132
('gender', 'identity', 'expression') 125
('employment', 'without', 'regard') 117
('2', 'years', 'experience') 116
('5', 'years', 'experience') 108

Topic modeling using BERTopic¶

In [66]:
from sklearn.cluster import KMeans
from bertopic import BERTopic

# Use KMeans in place of HDBSCAN so every title is assigned to one of n_topics clusters
cluster_model = KMeans(n_clusters=n_topics)
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2", hdbscan_model=cluster_model)
In [68]:
docs = df['title'].values.tolist()
topics, probs = topic_model.fit_transform(docs)
In [69]:
topic_model.get_topic_info()
Out[69]:
Topic Count Name
0 0 383 0_data_analyst_senior_remote
1 1 176 1_analyst_data_level_entry
2 2 164 2_scientist_data_science_analytics
3 3 67 3_senior_scientist_data_specialist
In [70]:
topic_model.get_document_info(docs)
Out[70]:
Document Topic Name Top_n_words Representative_document
0 Senior Analyst, Data Science and Analytics 0 0_data_analyst_senior_remote data - analyst - senior - remote - analytics -... False
1 Senior Data Scientist 3 3_senior_scientist_data_specialist senior - scientist - data - specialist - and -... True
2 Lead Data Science Analyst 0 0_data_analyst_senior_remote data - analyst - senior - remote - analytics -... False
3 Data Science Intern 0 0_data_analyst_senior_remote data - analyst - senior - remote - analytics -... False
4 Data Scientist 2 2_scientist_data_science_analytics scientist - data - science - analytics - jobs ... True
... ... ... ... ... ...
785 Research and Data Specialist 0 0_data_analyst_senior_remote data - analyst - senior - remote - analytics -... False
786 Quality Assurance Data Specialist 0 0_data_analyst_senior_remote data - analyst - senior - remote - analytics -... False
787 Senior Data Analyst 0 0_data_analyst_senior_remote data - analyst - senior - remote - analytics -... True
788 Cost Controller/Data Analyst 1 1_analyst_data_level_entry analyst - data - level - entry - hybrid - visu... False
789 Data Specialist 0 0_data_analyst_senior_remote data - analyst - senior - remote - analytics -... False

790 rows × 5 columns

In [71]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
In [121]:
# sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
# embeddings = sentence_model.encode(docs, show_progress_bar=False)


# # Train BERTopic
# topic_model = BERTopic().fit(docs, embeddings)

# # Run the visualization with the original embeddings
# topic_model.visualize_documents(docs, embeddings=embeddings)

# # Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
# reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
# topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
In [58]:
topic_model.visualize_hierarchy()
In [72]:
topic_model.visualize_heatmap()
In [73]:
# Calculate the topic distributions on a token-level
topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)

# Visualize the token-level distributions
doc_id = 0
print(docs[doc_id])
df_dist = topic_model.visualize_approximate_distribution(docs[doc_id], topic_token_distr[doc_id])
df_dist
Senior Analyst, Data Science and Analytics
Out[73]:
  Senior Analyst Data Science and Analytics
0_data_analyst_senior_remote 0.459 0.818 1.156 1.156 0.697 0.338
1_analyst_data_level_entry 0.353 0.678 0.819 0.819 0.466 0.140
2_scientist_data_science_analytics 0.194 0.408 0.657 0.657 0.463 0.250
3_senior_scientist_data_specialist 0.439 0.556 0.556 0.556 0.118 0.000

Similarity between job titles¶

Exercise: Find job title variations. Below is the starting code; a sketch of one possible solution follows it.

In [76]:
unique_titles = df['job_title'].unique()
len(unique_titles)
Out[76]:
507
In [77]:
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(unique_titles, show_progress_bar=True)
Batches:   0%|          | 0/16 [00:00<?, ?it/s]
In [78]:
from sentence_transformers import util

# Compute cosine similarities for each title with every other title
cosine_scores = util.cos_sim(embeddings, embeddings)

# Find the pairs with the highest cosine similarity scores
pairs = []
for i in range(len(cosine_scores)-1):
    for j in range(i+1, len(cosine_scores)):
        pairs.append({'index': [i, j], 'score': cosine_scores[i][j]})
        

# Sort scores in decreasing order
pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)

for pair in pairs[0:10]:
    i, j = pair['index']
    print("{} \t\t {} \t\t Score: {:.4f}".format(unique_titles[i], unique_titles[j], pair['score']))
hybrid data analyst 		 data analyst hybrid 		 Score: 0.9779
junior data analyst 		 data analyst junior 		 Score: 0.9774
data scientist analytics 		 data scientist data analytics 		 Score: 0.9766
senior data analyst 		 data analyst senior 		 Score: 0.9746
senior data analyst transportation optimization 3 		 senior data analyst transportation optimization 		 Score: 0.9737
senior data scientist remote 		 senior data scientist us remote 		 Score: 0.9731
senior data scientist 		 data scientist senior 		 Score: 0.9727
business data analyst 		 data analyst business analyst 		 Score: 0.9688
data analyst senior 		 data analyst technical senior 		 Score: 0.9649
senior technical data analyst 		 data analyst technical senior 		 Score: 0.9639
In [212]:
# Merge near-duplicate titles: treat any pair scoring >= 0.87 as the same title
# (0.87 is an empirically chosen similarity threshold)
for pair in pairs:
    i, j = pair['index']
    if pair['score'] >= 0.87:
        unique_titles[j] = unique_titles[i]
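One possible way to finish the exercise (a sketch, not the reference solution): rebuild the original title list from df, map every high-similarity variant onto a canonical representative, and compare the number of distinct titles before and after.

In [ ]:
# Sketch solution: collapse near-duplicate titles onto one canonical form.
# The loop above overwrote entries of unique_titles in place, so the original
# order is rebuilt from df; 0.87 is the same assumed threshold as above.
originals = df['job_title'].unique()
canonical = {t: t for t in originals}
for pair in pairs:
    i, j = pair['index']
    if pair['score'] >= 0.87:
        canonical[originals[j]] = canonical[originals[i]]

df['canonical_title'] = df['job_title'].map(canonical)
print(f"{len(originals)} raw titles -> {df['canonical_title'].nunique()} canonical titles")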
In [ ]: