In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource
from bokeh.palettes import Spectral6, Category10
from bokeh.io import output_notebook, reset_output

import string
import re

output_notebook()
Loading BokehJS ...
In [34]:
df = pd.read_csv('ds_jobs.csv')
In [35]:
df.head()
Out[35]:
id title company announcement description
0 0 Senior Analyst, Data Science and Analytics TransUnion The Muse TransUnion's Job Applicant Privacy Notice Wha...
1 1 Senior Data Scientist Grubhub Holdings, Inc. ZipRecruiter About The Opportunity We're all about connect...
2 2 Lead Data Science Analyst Discover Financial Services LinkedIn Discover. A brighter future. With us, you’ll ...
3 3 Data Science Intern AbelsonTaylor Startup Jobs Are you a 2023 college graduate or rising coll...
4 4 Data Scientist NORC at the University of Chicago SimplyHired JOB DESCRIPTION: At NORC, Data Scientists pla...
In [36]:
df.describe(include='all')
Out[36]:
id title company announcement description
count 790.000000 790 790 790 790
unique NaN 515 665 203 790
top NaN Data Scientist Upwork LinkedIn TransUnion's Job Applicant Privacy Notice Wha...
freq NaN 80 13 189 1
mean 394.500000 NaN NaN NaN NaN
std 228.197648 NaN NaN NaN NaN
min 0.000000 NaN NaN NaN NaN
25% 197.250000 NaN NaN NaN NaN
50% 394.500000 NaN NaN NaN NaN
75% 591.750000 NaN NaN NaN NaN
max 789.000000 NaN NaN NaN NaN
In [37]:
df.isna().sum()
Out[37]:
id              0
title           0
company         0
announcement    0
description     0
dtype: int64
In [38]:
df['announcement'].value_counts()[:20]
Out[38]:
LinkedIn                      189
SimplyHired                    79
ZipRecruiter                   66
Salary.com                     41
Startup Jobs                   20
Adzuna                         20
Glassdoor                      20
Greenhouse                     14
Upwork                         13
Built In                       13
Clearance Jobs                 10
SmartRecruiters Job Search      9
AngelList                       7
The Muse                        7
Lever                           6
Mendeley                        6
WayUp                           5
Monster                         5
Nexxt                           5
Dice                            5
Name: announcement, dtype: int64
In [39]:
# Extract job announcements
job_announcements = df['announcement'].value_counts().rename_axis('announcement').reset_index(name='counts')

source = ColumnDataSource(data=job_announcements)

# Create a rank-frequency line chart of the platforms
p = figure(height=350, width=600, title='Distribution of job announcement platforms')
p.line(x='index', y='counts', line_color='red', source=source, line_width=2)

p.xaxis.major_label_orientation = np.pi/4  # rotate labels by 45 degrees
# Label a few rank positions with the corresponding platform names
ticks = [0, 20, 40, 100, 150, 200]
p.xaxis.ticker = ticks
p.xaxis.major_label_overrides = {t: job_announcements['announcement'][t] for t in ticks}
p.xaxis.axis_label = 'Platforms'
p.yaxis.axis_label = 'Counts'
p.xgrid.grid_line_color = None

show(p)
In [40]:
df['company'].value_counts()
Out[40]:
Upwork                                     13
Walmart                                    12
Dice                                       10
Booz Allen Hamilton                        10
Cardinal Health                             6
                                           ..
InfiCare Software Technologies Pvt Ltd.     1
Faire                                       1
Apex Systems                                1
Bluestem Brands                             1
Hicuity Health                              1
Name: company, Length: 665, dtype: int64
In [41]:
# Extract companies
companies = df['company'].value_counts().rename_axis('company').reset_index(name='counts')

source = ColumnDataSource(data=companies)

# Create a rank-frequency line chart of the companies
p = figure(height=350, width=600, title='Distribution of hiring companies')
p.line(x='index', y='counts', line_color='red', source=source, line_width=2)

p.xaxis.major_label_orientation = np.pi/4  # rotate labels by 45 degrees
# Label a few rank positions with the corresponding company names
ticks = [0, 20, 40, 100, 150, 200, len(companies) - 1]
p.xaxis.ticker = ticks
p.xaxis.major_label_overrides = {t: companies['company'][t] for t in ticks}
p.xaxis.axis_label = 'Companies'
p.yaxis.axis_label = 'Counts'
p.xgrid.grid_line_color = None

show(p)
In [42]:
# Define a function to clean job titles
def clean_job_title(job_title):
    job_title = job_title.lower()  # convert to lowercase
    job_title = re.sub(r'[^a-zA-Z0-9\n\.]', ' ', job_title)  # remove special characters
    job_title = re.sub(r'\s+', ' ', job_title)  # collapse extra whitespace
    job_title = job_title.strip()  # remove leading/trailing whitespace
    return job_title


# Apply the clean_job_title function to the job title column
df['job_title'] = df['title'].apply(clean_job_title)

# Get the value counts of each job title
job_title_counts = df['job_title'].value_counts()

# Print the job title counts
print(job_title_counts)
data scientist                               81
data analyst                                 71
senior data scientist                        29
senior data analyst                          22
business data analyst                        13
                                             ..
data analyst monitoring                       1
vp data science                               1
senior data scientist game security           1
senior data scientist remote phd required     1
cost controller data analyst                  1
Name: job_title, Length: 507, dtype: int64
In [43]:
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk import ngrams



# Extract job titles
job_titles = df['title'].tolist()

# Preprocessing
stop_words = set(stopwords.words('english'))
tokens = []
processed_titles = []
for title in job_titles:
    # Convert to lowercase
    title = title.lower()
    # Remove punctuation and special characters
    title = re.sub(r'[^\w\s]', '', title)
    # Tokenize title
    title_tokens = word_tokenize(title)
    # Remove stop words
    title_tokens = [token for token in title_tokens if token not in stop_words]
    tokens.extend(title_tokens)
    processed_titles.append(title_tokens)


# Calculate word frequency
fdist = FreqDist(tokens)
top_n = 20
top_words = dict(fdist.most_common(top_n))

# Prepare data for visualization
data = {'words': list(top_words.keys()), 'frequency': list(top_words.values())}
source = ColumnDataSource(data=data)

# Create a frequency line chart of the top title words
p = figure(x_range=data['words'], height=350, title=f'Top {top_n} Most Common Words in Job Titles')
p.line(x='words', y='frequency', line_color='red', source=source, line_width=2)

p.xaxis.major_label_orientation = np.pi/4  # rotate labels by 45 degrees
p.xgrid.grid_line_color = None

show(p)
In [44]:
from wordcloud import WordCloud

# Generate word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(fdist)

# Visualize word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
# wordcloud.to_file('top20-wordcloud.png')

N-gram Analysis: Analyze the n-grams in job titles to identify patterns or trends.¶

In [45]:
# Calculate bigrams and trigrams
bigrams = []
trigrams = []
for title in processed_titles:
    bigrams.extend(list(ngrams(title, 2)))
    trigrams.extend(list(ngrams(title, 3)))

# Calculate frequency distribution of bigrams and trigrams
bigram_fdist = FreqDist(bigrams)
trigram_fdist = FreqDist(trigrams)

# Print the top 10 most common bigrams
print('Top 10 most common bigrams:')
for bigram, frequency in bigram_fdist.most_common(10):
    print(bigram, frequency)

# Print the top 10 most common trigrams
print('\nTop 10 most common trigrams:')
for trigram, frequency in trigram_fdist.most_common(10):
    print(trigram, frequency)
Top 10 most common bigrams:
('data', 'analyst') 341
('data', 'scientist') 277
('senior', 'data') 108
('data', 'science') 43
('data', 'analytics') 35
('sr', 'data') 27
('business', 'data') 25
('data', 'specialist') 23
('lead', 'data') 20
('data', 'analysis') 16

Top 10 most common trigrams:
('senior', 'data', 'scientist') 58
('senior', 'data', 'analyst') 42
('business', 'data', 'analyst') 20
('sr', 'data', 'scientist') 16
('sr', 'data', 'analyst') 11
('staff', 'data', 'scientist') 9
('lead', 'data', 'scientist') 9
('lead', 'data', 'analyst') 9
('entry', 'level', 'data') 8
('level', 'data', 'analyst') 8
In [46]:
# Join each bigram tuple into a single string for the word cloud
bigrams_freqs = {' '.join(bigram): freq for bigram, freq in bigram_fdist.items()}


# Generate word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(bigrams_freqs)

# Visualize word cloud
plt.figure(figsize=(8, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# wordcloud.to_file('bigrams-wordcloud.png')
In [47]:
# Join each trigram tuple into a single string for the word cloud
trigrams_freqs = {' '.join(trigram): freq for trigram, freq in trigram_fdist.items()}


# Generate word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(trigrams_freqs)

# Visualize word cloud
plt.figure(figsize=(8, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# wordcloud.to_file('trigrams-wordcloud.png')
In [48]:
from bokeh.transform import linear_cmap
from bokeh.palettes import Blues

# Extract the top 10 most common bigrams and trigrams
top_bigrams = bigram_fdist.most_common(10)
top_trigrams = trigram_fdist.most_common(10)

# Create a list of the bigram and trigram labels
bigram_labels = [', '.join(bigram) for bigram, _ in top_bigrams]
trigram_labels = [', '.join(trigram) for trigram, _ in top_trigrams]

# Create a list of the bigram and trigram frequencies
bigram_frequencies = [frequency for _, frequency in top_bigrams]
trigram_frequencies = [frequency for _, frequency in top_trigrams]

# Create a ColumnDataSource for the bigrams and trigrams
bigram_source = ColumnDataSource(data=dict(labels=bigram_labels, frequencies=bigram_frequencies))
trigram_source = ColumnDataSource(data=dict(labels=trigram_labels, frequencies=trigram_frequencies))

# Define color maps based on the bar heights
bigram_cmap = linear_cmap(field_name='frequencies', palette=Blues[9],
                          low=min(bigram_frequencies), high=max(bigram_frequencies))
trigram_cmap = linear_cmap(field_name='frequencies', palette=Blues[9],
                           low=min(trigram_frequencies), high=max(trigram_frequencies))

# Create a figure for the bigrams
bigram_plot = figure(y_range=bigram_labels, height=400, width=600,
                     title='Top 10 most common bigrams in job titles')
bigram_plot.hbar(y='labels', right='frequencies', height=0.8, source=bigram_source,
                 line_color='white', fill_color=bigram_cmap)
bigram_plot.x_range.start = 0
bigram_plot.ygrid.grid_line_color = None

# Create a figure for the trigrams
trigram_plot = figure(y_range=trigram_labels, height=400, width=600,
                      title='Top 10 most common trigrams in job titles')
trigram_plot.hbar(y='labels', right='frequencies', height=0.8, source=trigram_source,
                  line_color='white', fill_color=trigram_cmap)

trigram_plot.x_range.start = 0
trigram_plot.ygrid.grid_line_color = None

# Show the plots
show(bigram_plot)
show(trigram_plot)

Identifying seniority level¶

In [49]:
# Define a function to identify seniority level based on keywords in the job title
def get_seniority_level(title):
    senior_keywords = ['senior', 'lead', 'principal', 'vp', 'director', 'staff', 'manager']
    for keyword in senior_keywords:
        if keyword in title.lower():
            return 'Senior'
    return 'Other'

# Apply the get_seniority_level function to the job title column and create a new seniority_level column
df['seniority_level'] = df['title'].apply(get_seniority_level)

# Group the data by seniority level and count the number of job titles in each group
grouped_df = df.groupby('seniority_level')['title'].count().reset_index(name='count')

# Create a Bokeh ColumnDataSource object for the bar chart
source = ColumnDataSource(grouped_df)

# Create the bar chart with Bokeh
p = figure(x_range=grouped_df['seniority_level'], height=300, width=400,
           title='Distribution of Job Titles by Seniority Level')

p.vbar(x='seniority_level', top='count', width=0.9, source=source)
p.xgrid.grid_line_color = None
p.y_range.start = 0
p.xaxis.axis_label = 'Seniority Level'
p.yaxis.axis_label = 'Number of Job Titles'

show(p)

Extracting years of experience from job descriptions¶

In [50]:
# Apply the clean_job_title function to the description column
df['description'] = df['description'].apply(clean_job_title)

# Define the regex pattern to extract years of experience from the job description
exp_regex = r'(\d+)\+? year[s]? ?(?:of )?experience'

# Extract the first match as a Series (expand=False), then treat misses as 0
df['years_of_experience'] = df['description'].str.extract(exp_regex, expand=False)
df['years_of_experience'] = df['years_of_experience'].fillna(0).astype(int)
In [51]:
# Create a Bokeh ColumnDataSource object for the scatter plot
source = ColumnDataSource(df[['id', 'years_of_experience']])

# Create the scatter plot with Bokeh
p = figure(height=300, width=600,
           title='Distribution of titles by years of experience')

p.circle(x='id', y='years_of_experience', size=5, source=source)

show(p)

There is an outlier: one title shows 100 years of experience, so our regular expression has clearly extracted the wrong number.¶
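A quick way to confirm and patch this (a minimal sketch; the 40-year cutoff is an assumed sanity threshold, not part of the original analysis):

In [ ]:
# Sketch: inspect rows where the extracted value is implausible
outliers = df[df['years_of_experience'] > 40]
print(outliers[['id', 'title', 'years_of_experience']])

# Show what the regex actually matched in those descriptions
for desc in outliers['description']:
    print(re.findall(exp_regex, desc))

# Reset implausible values to 0, matching the fillna convention above
df.loc[df['years_of_experience'] > 40, 'years_of_experience'] = 0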

Distribution of job title lengths¶

In [52]:
# Calculate length of job titles
df['title_length'] = df['title'].apply(lambda x: len(x.split()))

# Create a histogram of title lengths
title_lengths = df['title_length'].tolist()
hist, edges = np.histogram(df['title_length'])

# Shift bin edges so the bars are centered on integer word counts
edges = edges - (edges[1] - edges[0]) / 2

# Create the plot
p = figure(title='Title Length Distribution',
           x_axis_label='Number of Words in Title',
           y_axis_label='Frequency')

p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color='white')
p.y_range.start = 0

# Show the plot
show(p)

Topic modeling¶

In [53]:
import nltk
from nltk.stem import WordNetLemmatizer
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in stop_words and token.isalpha()]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

# Preprocess the job titles
df['title_tokens'] = df['title'].apply(preprocess)

# Create a dictionary from the job titles
dictionary = Dictionary(df['title_tokens'])

# Create a corpus from the dictionary and job titles
corpus = [dictionary.doc2bow(title_tokens) for title_tokens in df['title_tokens']]
[nltk_data] Downloading package stopwords to /home/mehdi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mehdi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
In [54]:
# Train the LDA model
topics_range = [3, 4, 5, 6, 10]
cohs = []
for num_topics in topics_range:
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=10, iterations=400)

    cm = CoherenceModel(model=lda_model, texts=df['title_tokens'].values.tolist(), coherence='c_v')
    # cm = CoherenceModel(model=lda_model, corpus=corpus, coherence='u_mass')

    coherence = cm.get_coherence()
    cohs.append(coherence)
    # print(f"Number of topics: {num_topics}, Coherence: {coherence}")   
In [55]:
print(f"Coherence: {cohs}")
n_topics = topics_range[np.argmax(cohs)]
# print(n_topics)
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=n_topics, passes=10, iterations=500)
Coherence: [0.5183223261652622, 0.5284432948010035, 0.5038624806742165, 0.5269961172403413, 0.5121491219485661]
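A small optional sketch (matplotlib is already imported) makes this selection easier to read by plotting the coherence scores against the candidate topic counts:

In [ ]:
# Optional sketch: coherence score for each candidate number of topics
plt.figure(figsize=(5, 3))
plt.plot(topics_range, cohs, marker='o')
plt.xlabel('Number of topics')
plt.ylabel('Coherence (c_v)')
plt.title('LDA coherence by number of topics')
plt.show()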
In [56]:
# Print the top 10 words for each topic
for topic_id, topic_words in lda_model.show_topics(num_topics=n_topics, num_words=10, formatted=False):
    print(f"Topic {topic_id}: {[word[0] for word in topic_words]}")
Topic 0: ['operation', 'insight', 'engineer', 'analysis', 'analytics', 'real', 'specialist', 'estate', 'assurance', 'big']
Topic 1: ['data', 'analyst', 'science', 'analytics', 'remote', 'lead', 'intern', 'manager', 'marketing', 'director']
Topic 2: ['data', 'scientist', 'senior', 'remote', 'analytics', 'analyst', 'staff', 'specialist', 'ii', 'product']
Topic 3: ['analyst', 'data', 'business', 'analysis', 'specialist', 'level', 'reporting', 'ii', 'entry', 'junior']
In [57]:
# Get the topic distribution for each job title
df['topic_distribution'] = df['title_tokens'].apply(lambda x: lda_model[dictionary.doc2bow(x)])
# Pick the topic id with the highest probability; the distribution is a list of
# (topic_id, prob) pairs, so select by probability rather than by list position
df['topic'] = df['topic_distribution'].apply(lambda dist: max(dist, key=lambda t: t[1])[0])
In [58]:
# Group job titles by topic and count the number of titles in each group
topic_counts = df.groupby('topic')['title'].count()
# topic_counts
In [59]:
from bokeh.models import FactorRange
from bokeh.palettes import Spectral5

# Create a Bokeh data source with the topic counts
source = ColumnDataSource(data={
    'topics': [str(t) for t in topic_counts.index],
    'counts': topic_counts.values,
})

# Define the x-axis and y-axis ranges
x_range = FactorRange(factors=source.data['topics'])
y_range = (0, max(source.data['counts']) * 1.1)

# Create a figure object
p = figure(x_range=x_range, y_range=y_range, height=400, width=800, title='Distribution of Job Titles by Topic')

# Add a vertical bar chart to the figure
p.vbar(x='topics', top='counts', width=0.9, source=source, line_color='white', fill_color=Spectral5[0])

# Set visual properties for the figure
# p.xaxis.major_label_orientation = 1
p.xgrid.grid_line_color = None
p.xaxis.axis_label = 'Topics'
p.yaxis.axis_label = 'Number of Job Titles'
p.yaxis.minor_tick_line_color = None
p.yaxis.major_tick_line_color = None
# p.title.text_font_size = '16pt'

# Create an HTML file for the output and show the figure
# output_file('job_title_distribution.html')
show(p)
In [61]:
# Create a heatmap of the topic distributions for each job title
import seaborn as sns

topic_distributions = np.zeros((len(df), n_topics))
for i, doc in enumerate(corpus):
    for topic, prob in lda_model.get_document_topics(doc):
        topic_distributions[i][topic] = prob

plt.figure(figsize=(5, 10))
heatmap_plot = sns.heatmap(topic_distributions, cmap='Blues', cbar=True)
heatmap_fig = heatmap_plot.get_figure()
plt.xlabel('Topic')
plt.ylabel('Job Title Id')
plt.title('Topic Distributions for Job Titles')
plt.show()
# heatmap_fig.savefig('topic_distributions.png')

Job description analysis¶

In [63]:
# Extract job descriptions
job_descriptions = df['description'].tolist()

# Preprocessing
stop_words = set(stopwords.words('english'))
tokens = []
processed_description = []
for description in job_descriptions:
    # Convert to lowercase
    description = description.lower()
    # Remove punctuation and special characters
    description = re.sub(r'[^\w\s]', '', description)
    # Tokenize the description
    description_tokens = word_tokenize(description)
    # Remove stop words
    description_tokens = [token for token in description_tokens if token not in stop_words]
    tokens.extend(description_tokens)
    processed_description.append(description_tokens)


# Calculate word frequency
fdist = FreqDist(tokens)
top_n = 20
top_words = dict(fdist.most_common(top_n))

# Prepare data for visualization
data = {'words': list(top_words.keys()), 'frequency': list(top_words.values())}
source = ColumnDataSource(data=data)

# Create a frequency line chart of the top description words
p = figure(x_range=data['words'], height=350, title=f'Top {top_n} Most Common Words in Job Descriptions', toolbar_location=None, tools="")
p.line(x='words', y='frequency', line_color='navy', source=source, line_width=2)
p.xaxis.major_label_orientation = np.pi/4  # rotate labels by 45 degrees
p.xgrid.grid_line_color = None

show(p)
In [64]:

# Generate word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(fdist)

# Visualize word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
In [65]:
# Calculate bigrams and trigrams
bigrams = []
trigrams = []
for description in processed_description:
    bigrams.extend(list(ngrams(description, 2)))
    trigrams.extend(list(ngrams(description, 3)))

# Calculate frequency distribution of bigrams and trigrams
bigram_fdist = FreqDist(bigrams)
trigram_fdist = FreqDist(trigrams)

# Print the top 10 most common bigrams
print('Top 10 most common bigrams:')
for bigram, frequency in bigram_fdist.most_common(10):
    print(bigram, frequency)

# Print the top 10 most common trigrams
print('Top 10 most common trigrams:')
for trigram, frequency in trigram_fdist.most_common(10):
    print(trigram, frequency)
Top 10 most common bigrams:
('data', 'science') 968
('machine', 'learning') 701
('data', 'analysis') 607
('data', 'analyst') 546
('years', 'experience') 537
('data', 'scientist') 459
('data', 'analytics') 403
('computer', 'science') 370
('equal', 'opportunity') 368
('bachelor', 'degree') 352
Top 10 most common trigrams:
('equal', 'opportunity', 'employer') 272
('sexual', 'orientation', 'gender') 248
('orientation', 'gender', 'identity') 228
('race', 'color', 'religion') 177
('without', 'regard', 'race') 161
('regard', 'race', 'color') 132
('gender', 'identity', 'expression') 125
('employment', 'without', 'regard') 117
('2', 'years', 'experience') 116
('5', 'years', 'experience') 108

Topic modeling using BERTopic¶

In [66]:
from sklearn.cluster import KMeans
from bertopic import BERTopic

# Use KMeans in place of HDBSCAN so every title is assigned to one of n_topics clusters
cluster_model = KMeans(n_clusters=n_topics)
topic_model = BERTopic(embedding_model="all-MiniLM-L6-v2", hdbscan_model=cluster_model)
In [68]:
docs = df['title'].values.tolist()
topics, probs = topic_model.fit_transform(docs)
In [69]:
topic_model.get_topic_info()
Out[69]:
Topic Count Name
0 0 383 0_data_analyst_senior_remote
1 1 176 1_analyst_data_level_entry
2 2 164 2_scientist_data_science_analytics
3 3 67 3_senior_scientist_data_specialist
In [70]:
topic_model.get_document_info(docs)
Out[70]:
Document Topic Name Top_n_words Representative_document
0 Senior Analyst, Data Science and Analytics 0 0_data_analyst_senior_remote data - analyst - senior - remote - analytics -... False
1 Senior Data Scientist 3 3_senior_scientist_data_specialist senior - scientist - data - specialist - and -... True
2 Lead Data Science Analyst 0 0_data_analyst_senior_remote data - analyst - senior - remote - analytics -... False
3 Data Science Intern 0 0_data_analyst_senior_remote data - analyst - senior - remote - analytics -... False
4 Data Scientist 2 2_scientist_data_science_analytics scientist - data - science - analytics - jobs ... True
... ... ... ... ... ...
785 Research and Data Specialist 0 0_data_analyst_senior_remote data - analyst - senior - remote - analytics -... False
786 Quality Assurance Data Specialist 0 0_data_analyst_senior_remote data - analyst - senior - remote - analytics -... False
787 Senior Data Analyst 0 0_data_analyst_senior_remote data - analyst - senior - remote - analytics -... True
788 Cost Controller/Data Analyst 1 1_analyst_data_level_entry analyst - data - level - entry - hybrid - visu... False
789 Data Specialist 0 0_data_analyst_senior_remote data - analyst - senior - remote - analytics -... False

790 rows × 5 columns

In [71]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
In [121]:
# sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
# embeddings = sentence_model.encode(docs, show_progress_bar=False)


# # Train BERTopic
# topic_model = BERTopic().fit(docs, embeddings)

# # Run the visualization with the original embeddings
# topic_model.visualize_documents(docs, embeddings=embeddings)

# # Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
# reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
# topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
In [58]:
topic_model.visualize_hierarchy()
In [72]:
topic_model.visualize_heatmap()
In [73]:
# Calculate the topic distributions on a token-level
topic_distr, topic_token_distr = topic_model.approximate_distribution(docs, calculate_tokens=True)

# Visualize the token-level distributions
doc_id = 0
print(docs[doc_id])
df_dist = topic_model.visualize_approximate_distribution(docs[doc_id], topic_token_distr[doc_id])
df_dist
Senior Analyst, Data Science and Analytics
Out[73]:
  Senior Analyst Data Science and Analytics
0_data_analyst_senior_remote 0.459 0.818 1.156 1.156 0.697 0.338
1_analyst_data_level_entry 0.353 0.678 0.819 0.819 0.466 0.140
2_scientist_data_science_analytics 0.194 0.408 0.657 0.657 0.463 0.250
3_senior_scientist_data_specialist 0.439 0.556 0.556 0.556 0.118 0.000

Similarity between job titles¶

Exercise: Find job title variations. Below is the starting code; a sketch of one possible solution follows it.

In [76]:
unique_titles = df['job_title'].unique()
len(unique_titles)
Out[76]:
507
In [77]:
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(unique_titles, show_progress_bar=True)
Batches:   0%|          | 0/16 [00:00<?, ?it/s]
In [78]:
from sentence_transformers import util

# Compute cosine similarities for each title with every other title
cosine_scores = util.cos_sim(embeddings, embeddings)

# Find the pairs with the highest cosine similarity scores
pairs = []
for i in range(len(cosine_scores)-1):
    for j in range(i+1, len(cosine_scores)):
        pairs.append({'index': [i, j], 'score': cosine_scores[i][j]})
        

# Sort scores in decreasing order
pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)

for pair in pairs[0:10]:
    i, j = pair['index']
    print("{} \t\t {} \t\t Score: {:.4f}".format(unique_titles[i], unique_titles[j], pair['score']))
hybrid data analyst 		 data analyst hybrid 		 Score: 0.9779
junior data analyst 		 data analyst junior 		 Score: 0.9774
data scientist analytics 		 data scientist data analytics 		 Score: 0.9766
senior data analyst 		 data analyst senior 		 Score: 0.9746
senior data analyst transportation optimization 3 		 senior data analyst transportation optimization 		 Score: 0.9737
senior data scientist remote 		 senior data scientist us remote 		 Score: 0.9731
senior data scientist 		 data scientist senior 		 Score: 0.9727
business data analyst 		 data analyst business analyst 		 Score: 0.9688
data analyst senior 		 data analyst technical senior 		 Score: 0.9649
senior technical data analyst 		 data analyst technical senior 		 Score: 0.9639
In [212]:
# Merge near-duplicate titles: treat any pair scoring >= 0.87 as the same title
# (0.87 is an empirically chosen similarity threshold)
for pair in pairs:
    i, j = pair['index']
    if pair['score'] >= 0.87:
        unique_titles[j] = unique_titles[i]
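One possible way to finish the exercise (a sketch, not the reference solution): rebuild the original title list from df, map every high-similarity variant onto a canonical representative, and compare the number of distinct titles before and after.

In [ ]:
# Sketch solution: collapse near-duplicate titles onto one canonical form.
# The loop above overwrote entries of unique_titles in place, so the original
# order is rebuilt from df; 0.87 is the same assumed threshold as above.
originals = df['job_title'].unique()
canonical = {t: t for t in originals}
for pair in pairs:
    i, j = pair['index']
    if pair['score'] >= 0.87:
        canonical[originals[j]] = canonical[originals[i]]

df['canonical_title'] = df['job_title'].map(canonical)
print(f"{len(originals)} raw titles -> {df['canonical_title'].nunique()} canonical titles")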
In [ ]: