Skip to content

Theyka/JobScan

Folders and files

Name
Last commit message
Last commit date

Latest commit

 

History

34 Commits
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

Project Ownership


Tech Stack:

  • Python
  • Tailwind CSS
  • Supabase

How to configure:

  1. Frontend config:

    • Copy frontend/config.js.example to frontend/config.js
    • Set:
      • SUPABASE_URL
      • SUPABASE_ANON_KEY
  2. Backend config:

    • Copy backend/.env.example to backend/.env
    • Set:
      • SUPABASE_URL
      • SUPABASE_KEY
      • SUPABASE_SERVICE_KEY
      • SCRAPE_TIME_INTERVAL (seconds, default 600)
  3. Run frontend:

    • python3 serve_frontend.py --port 8080
    • Open http://127.0.0.1:8080/
  4. Run backend scraper:

    • python3 backend/main.py
    • Each cycle also runs duplicate detection and refreshes duplicate_jobs

Supabase Database Schema:

-- Employers scraped from Glorri; referenced by glorri_vacancies.company_id.
CREATE TABLE public.glorri_companies (
  id         bigint GENERATED BY DEFAULT AS IDENTITY NOT NULL,
  slug       text NULL,
  name       text NULL,
  logo       text NULL,
  created_at timestamp with time zone NOT NULL DEFAULT now(),
  CONSTRAINT glorri_companies_pkey PRIMARY KEY (id)
) TABLESPACE pg_default;

-- Vacancies scraped from Glorri, one row per posting.
-- NOTE: "postedDate", "jobFunction" and "careerLevel" are quoted camelCase
-- identifiers and therefore case-sensitive in Postgres — they must always
-- be double-quoted in queries.
CREATE TABLE public.glorri_vacancies (
  id            bigint GENERATED BY DEFAULT AS IDENTITY NOT NULL,
  title         text NULL,
  slug          text NULL,
  "postedDate"  text NULL,
  "jobFunction" text NULL,
  "careerLevel" text NULL,
  location      text NULL,
  type          text NULL,
  detail_url    text NULL,
  text          text NULL,
  vacancy_about jsonb NULL,
  benefits      jsonb NULL,
  apply_url     text NULL,
  company_id    bigint NULL,
  created_at    timestamp with time zone NOT NULL DEFAULT now(),
  tech_stack    jsonb NULL,
  CONSTRAINT glorri_vacancies_pkey PRIMARY KEY (id),
  -- Deleting/renumbering a company removes/moves its vacancies with it.
  CONSTRAINT glorri_vacancies_company_id_fkey FOREIGN KEY (company_id)
    REFERENCES glorri_companies (id) ON UPDATE CASCADE ON DELETE CASCADE
) TABLESPACE pg_default;

-- Companies from the JobSearch source; referenced by js_vacancies.company_id.
CREATE TABLE public.js_companies (
  id          bigint GENERATED BY DEFAULT AS IDENTITY NOT NULL,
  title       text NOT NULL,
  logo        text NULL,
  logo_mini   text NULL,
  first_char  text NOT NULL,
  created_at  timestamp with time zone NOT NULL DEFAULT now(),
  text        text NULL,
  address     text NULL,
  phones      jsonb NULL,
  sites       jsonb NULL,
  email       jsonb NULL,
  cover       text NULL,
  coordinates jsonb NOT NULL,
  CONSTRAINT companies_pkey PRIMARY KEY (id)
) TABLESPACE pg_default;

-- Vacancies from the JobSearch source. `id` has no identity/default, so it
-- appears to be supplied by the scraper from the upstream site — confirm.
create table public.js_vacancies (
  id bigint not null,
  title text not null,
  created_at timestamp with time zone not null,
  slug text not null,
  company_id bigint not null,
  salary bigint null,
  deadline_at timestamp with time zone null,
  text text null,
  tech_stack jsonb null,
  constraint vacancies_pkey primary key (id),
  -- FIX: removed "vacancies_id_key unique (id)" — it duplicated the primary
  -- key on the same column and maintained a redundant second index. The PK
  -- still provides the unique index the duplicate_jobs FK requires.
  constraint vacancies_company_id_fkey foreign key (company_id) references js_companies (id) on update cascade on delete cascade
) TABLESPACE pg_default;

-- Cross-source duplicate map: pairs a Glorri vacancy with its JobSearch
-- counterpart. PK on glorri_id plus UNIQUE on jobsearch_id makes the
-- mapping one-to-one in both directions.
CREATE TABLE public.duplicate_jobs (
  glorri_id    bigint NOT NULL,
  jobsearch_id bigint NOT NULL,
  score        numeric(6, 4) NOT NULL,  -- match score, 4 decimal places
  matched_at   timestamp with time zone NOT NULL DEFAULT now(),
  CONSTRAINT duplicate_jobs_pkey PRIMARY KEY (glorri_id),
  CONSTRAINT duplicate_jobs_jobsearch_id_key UNIQUE (jobsearch_id),
  -- Rows disappear automatically when either side's vacancy is deleted.
  CONSTRAINT duplicate_jobs_glorri_id_fkey FOREIGN KEY (glorri_id)
    REFERENCES glorri_vacancies (id) ON UPDATE CASCADE ON DELETE CASCADE,
  CONSTRAINT duplicate_jobs_jobsearch_id_fkey FOREIGN KEY (jobsearch_id)
    REFERENCES js_vacancies (id) ON UPDATE CASCADE ON DELETE CASCADE
) TABLESPACE pg_default;

-- Unique-visitor log: the composite primary key allows at most one row per
-- (source, vacancy, day, visitor), which is what turns raw hits into a
-- unique-visit count. Written only by increment_vacancy_visit().
CREATE TABLE public.vacancy_visits (
  source       text NOT NULL,
  vacancy_id   bigint NOT NULL,
  visit_day    date NOT NULL,
  visitor_hash text NOT NULL,
  slug         text NOT NULL DEFAULT '',
  created_at   timestamp with time zone NOT NULL DEFAULT now(),
  CONSTRAINT vacancy_visits_pkey PRIMARY KEY (source, vacancy_id, visit_day, visitor_hash),
  CONSTRAINT vacancy_visits_source_check CHECK (source IN ('jobsearch', 'glorri'))
) TABLESPACE pg_default;

-- Records one unique visit per (source, vacancy, UTC day, visitor).
-- SECURITY DEFINER lets it insert into vacancy_visits (which has no public
-- write policy); search_path is pinned to public, which is the standard
-- precaution against search-path hijacking of SECURITY DEFINER functions.
create or replace function public.increment_vacancy_visit(
  p_source text,                -- must normalize to 'jobsearch' or 'glorri'
  p_vacancy_id bigint,          -- must be a positive id
  p_slug text default '',
  p_visitor_ip text default ''  -- hashed before storage, never stored raw
)
returns void
language plpgsql
security definer
set search_path = public
as $$
declare
  normalized_source text := lower(btrim(coalesce(p_source, '')));
  normalized_slug text := btrim(coalesce(p_slug, ''));
  normalized_ip text := btrim(coalesce(p_visitor_ip, ''));
  -- Day is computed in UTC so the per-day dedup window does not depend on
  -- the server's TimeZone setting.
  normalized_visit_day date := (now() at time zone 'utc')::date;
  hashed_visitor text;
begin
  if normalized_source not in ('jobsearch', 'glorri') then
    raise exception 'Invalid source: %', p_source;
  end if;

  if p_vacancy_id is null or p_vacancy_id <= 0 then
    raise exception 'Invalid vacancy id: %', p_vacancy_id;
  end if;

  -- A caller that supplies no IP is silently ignored: without a visitor
  -- identity the visit cannot be deduplicated.
  if normalized_ip = '' then
    return;
  end if;

  -- NOTE(review): md5 of a raw IP is trivially brute-forceable (the IPv4
  -- space is only ~4.3e9 values), so this is pseudonymization, not
  -- anonymization. Consider a keyed hash (HMAC with a server-side secret)
  -- if IP privacy is a requirement — confirm.
  hashed_visitor := md5(normalized_ip);

  insert into public.vacancy_visits (
    source,
    vacancy_id,
    visit_day,
    visitor_hash,
    slug,
    created_at
  )
  values (
    normalized_source,
    p_vacancy_id,
    normalized_visit_day,
    hashed_visitor,
    normalized_slug,
    now()
  )
  -- A repeat visit on the same UTC day collides with the composite PK and
  -- is dropped — this is what makes the table count unique visitors.
  on conflict (source, vacancy_id, visit_day, visitor_hash)
  do nothing;
end;
$$;

-- Click-through log for apply links. Unlike vacancy_visits this keeps every
-- event (surrogate id PK, no dedup key), so it records raw clicks.
CREATE TABLE public.vacancy_clicks (
  id           bigint GENERATED BY DEFAULT AS IDENTITY NOT NULL,
  source       text NOT NULL,
  vacancy_id   bigint NOT NULL,
  slug         text NOT NULL DEFAULT '',
  visitor_hash text NOT NULL DEFAULT '',
  target_url   text NOT NULL DEFAULT '',
  clicked_at   timestamp with time zone NOT NULL DEFAULT now(),
  CONSTRAINT vacancy_clicks_pkey PRIMARY KEY (id),
  CONSTRAINT vacancy_clicks_source_check CHECK (source IN ('jobsearch', 'glorri'))
) TABLESPACE pg_default;

Supabase RLS (public read only):

Run this in Supabase SQL Editor:

-- Enable RLS on every table. Tables with no policy created below
-- (vacancy_visits, vacancy_clicks) become unreadable/unwritable for anon
-- clients; the backend bypasses RLS via the service role.
ALTER TABLE public.glorri_companies ENABLE ROW LEVEL SECURITY;
ALTER TABLE public.glorri_vacancies ENABLE ROW LEVEL SECURITY;
ALTER TABLE public.js_companies ENABLE ROW LEVEL SECURITY;
ALTER TABLE public.js_vacancies ENABLE ROW LEVEL SECURITY;
ALTER TABLE public.duplicate_jobs ENABLE ROW LEVEL SECURITY;
ALTER TABLE public.vacancy_visits ENABLE ROW LEVEL SECURITY;
ALTER TABLE public.vacancy_clicks ENABLE ROW LEVEL SECURITY;

-- Public read-only policies: anyone may SELECT from the five catalog
-- tables; no INSERT/UPDATE/DELETE policy exists, so writes stay blocked.
-- DROP ... IF EXISTS first makes the script safe to re-run.
DROP POLICY IF EXISTS "Enable read access for all users" ON public.glorri_companies;
CREATE POLICY "Enable read access for all users"
  ON public.glorri_companies FOR SELECT TO public USING (true);

DROP POLICY IF EXISTS "Enable read access for all users" ON public.glorri_vacancies;
CREATE POLICY "Enable read access for all users"
  ON public.glorri_vacancies FOR SELECT TO public USING (true);

DROP POLICY IF EXISTS "Enable read access for all users" ON public.js_companies;
CREATE POLICY "Enable read access for all users"
  ON public.js_companies FOR SELECT TO public USING (true);

DROP POLICY IF EXISTS "Enable read access for all users" ON public.js_vacancies;
CREATE POLICY "Enable read access for all users"
  ON public.js_vacancies FOR SELECT TO public USING (true);

DROP POLICY IF EXISTS "Enable read access for all users" ON public.duplicate_jobs;
CREATE POLICY "Enable read access for all users"
  ON public.duplicate_jobs FOR SELECT TO public USING (true);

-- Lock the visit-counter RPC down to the backend (service role) only.
-- FIX: revoking from PUBLIC alone is insufficient on Supabase — its default
-- privileges also grant EXECUTE on new functions to the anon and
-- authenticated roles directly, so those grants must be revoked explicitly
-- or any client could call the function through PostgREST.
revoke all on function public.increment_vacancy_visit(text, bigint, text, text) from public;
revoke all on function public.increment_vacancy_visit(text, bigint, text, text) from anon, authenticated;
grant execute on function public.increment_vacancy_visit(text, bigint, text, text) to service_role;

vacancy_visits and vacancy_clicks intentionally have no public read/write policy. They are written from the server using the service role. views_count now represents unique visitor IPs per vacancy per UTC day (not raw hits).

About

Resources

License

Stars

Watchers

Forks

Contributors