本文为看雪论坛优秀文章
看雪论坛作者ID:e*16 a
以下是基于linux0.11的代码。
一
内核的五大结构
二
中断工作流程
1、ARM回忆
(1)做CPU工作模式的转化
(2)进行寄存器的拷贝与压栈
(3)设置中断向量表
(4)保存正常运行函数的返回地址
(5)跳转到对应的中断服务函数上运行
(6)进行模式的复原及寄存器的复原
(7)跳转回正常工作的函数地址继续运行
2、linux中中断的工作流程
(1)将所有寄存器值入栈
(2)将异常码入栈(中断号)
(3)将当前函数的返回地址入栈
(4)调用中断函数
(5)返回地址出栈
(6)寄存器值出栈
3、中断源码
中断前后的处理 中断的执行
硬件中断的处理过程 asm.s trap.c
软件及系统调用的处理过程 system_call.s fork.c/signal.c/exit.c/sys.c
① asm.s代码及trap.c分析 (OPENING)
② system_call.s代码及fork.c/signal.c/exit.c/sys.c分析
(1) fork.c
system_call.s中实现了fork系统调用的入口_sys_fork:先call _find_empty_process,然后call _copy_process。
2
# fork system-call entry: pick a free task slot, then clone the caller into it.
_sys_fork:
# find_empty_process returns the slot index, or a negative error code, in %eax
call _find_empty_process
testl %eax,%eax
# negative result: skip the copy and return the error as-is
js 1f
# These five pushes line up with copy_process's leading parameters
# (nr, ebp, edi, esi, gs); %eax, pushed last, becomes nr.
push %gs
pushl %esi
pushl %edi
pushl %ebp
pushl %eax
call _copy_process
# drop the five 4-byte arguments pushed above
addl $20,%esp
1: ret
extern void write_verify(unsigned long address);
long last_pid=0;
/*
 * Run write_verify() on every 4096-byte page overlapped by the range
 * [addr, addr+size). addr is an offset inside the current task's data
 * segment; the segment base (ldt[2]) is added to form the address that
 * is handed to write_verify().
 */
void verify_area(void * addr,int size)
{
	unsigned long page = (unsigned long) addr;
	int remaining = size + (page & 0xfff);	/* include the bytes before addr in its page */

	page = (page & 0xfffff000) + get_base(current->ldt[2]);
	for ( ; remaining > 0 ; remaining -= 4096, page += 4096)
		write_verify(page);
}
/*
 * Give task slot nr its own 0x4000000-byte linear window and copy the
 * current task's page tables into it.
 *
 * Panics if the current task's code and data segments have different
 * bases, or if the data limit is smaller than the code limit.
 * Returns 0 on success, -ENOMEM if copy_page_tables() fails (whatever
 * was mapped for the child is freed again first).
 */
int copy_mem(int nr,struct task_struct * p)
{
	unsigned long parent_code, parent_data;
	unsigned long child_code, child_data;
	unsigned long code_size, data_size;

	code_size = get_limit(0x0f);
	data_size = get_limit(0x17);
	parent_code = get_base(current->ldt[1]);
	parent_data = get_base(current->ldt[2]);
	if (parent_data != parent_code)
		panic("We don't support separate I&D");
	if (data_size < code_size)
		panic("Bad data_limit");

	/* code and data share one base, derived from the slot number */
	child_data = child_code = nr * 0x4000000;
	p->start_code = child_code;
	set_base(p->ldt[1],child_code);
	set_base(p->ldt[2],child_data);

	if (!copy_page_tables(parent_data,child_data,data_size))
		return 0;
	free_page_tables(child_data,data_size);
	return -ENOMEM;
}
/*
* Ok, this is the main fork-routine. It copies the system process
* information (task[nr]) and sets up the necessary registers. It
* also copies the data segment in it's entirety.
*
* nr is the task[] slot to fill. ebp, edi, esi, gs and nr are pushed by
* _sys_fork; the other parameters are presumably the register values
* already saved on the kernel stack by the system-call entry path
* (NOTE(review): confirm against system_call.s).
*
* Returns last_pid on success, or -EAGAIN when no free page is available
* or copy_mem() fails.
*/
int copy_process(int nr,long ebp,long edi,long esi,long gs,long none,
long ebx,long ecx,long edx,
long fs,long es,long ds,
long eip,long cs,long eflags,long esp,long ss)
{
struct task_struct *p; // the child's task_struct
int i;
struct file *f;
p = (struct task_struct *) get_free_page();
if (!p)
return -EAGAIN;
task[nr] = p; // publish the child in the task[] array
*p = *current; /* NOTE! this doesn't copy the supervisor stack */
// From here on, overwrite the fields that must differ from the parent.
// Keep the child unrunnable until it is fully set up (see the last assignment).
p->state = TASK_UNINTERRUPTIBLE;
p->pid = last_pid;
p->father = current->pid;
p->counter = p->priority;
p->signal = 0;
p->alarm = 0;
p->leader = 0; /* process leadership doesn't inherit */
p->utime = p->stime = 0;
p->cutime = p->cstime = 0;
p->start_time = jiffies;
p->tss.back_link = 0;
// kernel stack pointer: the end of the page that holds the task_struct
p->tss.esp0 = PAGE_SIZE + (long) p;
p->tss.ss0 = 0x10;
p->tss.eip = eip;
p->tss.eflags = eflags;
// the child's saved eax is 0, which is what fork() returns in the child
p->tss.eax = 0;
p->tss.ecx = ecx;
p->tss.edx = edx;
p->tss.ebx = ebx;
p->tss.esp = esp;
p->tss.ebp = ebp;
p->tss.esi = esi;
p->tss.edi = edi;
// segment selectors are 16 bits wide; mask off the upper half of each long
p->tss.es = es & 0xffff;
p->tss.cs = cs & 0xffff;
p->tss.ss = ss & 0xffff;
p->tss.ds = ds & 0xffff;
p->tss.fs = fs & 0xffff;
p->tss.gs = gs & 0xffff;
p->tss.ldt = _LDT(nr);
p->tss.trace_bitmap = 0x80000000;
if (last_task_used_math == current)
__asm__("clts ; fnsave %0"::"m" (p->tss.i387)); // parent owns the math state: save it into the child's tss.i387
if (copy_mem(nr,p)) { // set up the child's segments and page tables
// undo the slot assignment and release the page on failure
task[nr] = NULL;
free_page((long) p);
return -EAGAIN;
}
// the child shares the parent's open files and inodes: bump their reference counts
for (i=0; i<NR_OPEN;i++)
if (f=p->filp[i])
f->f_count++;
if (current->pwd)
current->pwd->i_count++;
if (current->root)
current->root->i_count++;
if (current->executable)
current->executable->i_count++;
set_tss_desc(gdt+(nr<<1)+FIRST_TSS_ENTRY,&(p->tss));
set_ldt_desc(gdt+(nr<<1)+FIRST_LDT_ENTRY,&(p->ldt));
p->state = TASK_RUNNING; /* do this last, just in case */
return last_pid;
}
/*
 * Advance last_pid to a value no existing task uses (wrapping to 1 when
 * it goes negative), then return the index of the first free task[] slot
 * at or above 1. Returns -EAGAIN when every slot is taken.
 */
int find_empty_process(void)
{
	int slot;
	int pid_in_use;

	do {
		if (++last_pid < 0)
			last_pid = 1;
		pid_in_use = 0;
		for (slot = 0 ; slot < NR_TASKS && !pid_in_use ; slot++)
			if (task[slot] && task[slot]->pid == last_pid)
				pid_in_use = 1;
	} while (pid_in_use);

	/* slot 0 is never handed out */
	for (slot = 1 ; slot < NR_TASKS ; slot++)
		if (!task[slot])
			return slot;
	return -EAGAIN;
}
① 在task链表中找一个进程空位存放
② 创建一个task_struct
③ 设置task_struct
(2)signal.c
这里只是进行一个简单的分析,详细分析请见第五章。
volatile void do_exit(int error_code);
/* Return the current task's blocked-signal bitmap. */
int sys_sgetmask()
{
	int mask = current->blocked;

	return mask;
}
/*
 * Replace the current task's blocked-signal bitmap with newmask, with
 * the SIGKILL bit always cleared. Returns the previous bitmap.
 */
int sys_ssetmask(int newmask)
{
	int previous = current->blocked;
	int sigkill_bit = 1 << (SIGKILL-1);

	current->blocked = newmask & ~sigkill_bit;
	return previous;
}
/*
 * Copy one struct sigaction byte by byte from kernel memory (from) to
 * the fs-segment address to, after verify_area() has checked the
 * destination range.
 */
static inline void save_old(char * from,char * to)
{
	unsigned int remaining = sizeof(struct sigaction);

	verify_area(to, sizeof(struct sigaction));
	while (remaining--) {
		put_fs_byte(*from,to);
		from++;
		to++;
	}
}
/*
 * Copy one struct sigaction byte by byte from the fs-segment address
 * from into kernel memory at to.
 */
static inline void get_new(char * from,char * to)
{
	char *end = to + sizeof(struct sigaction);

	while (to != end) {
		*to = get_fs_byte(from);
		to++;
		from++;
	}
}
int sys_signal(int signum, long handler, long restorer)
{
struct sigaction tmp;
if (signum<1 || signum>32 || signum==SIGKILL) //判断信号值是否合法
return -1;
tmp.sa_handler = (void (*)(int)) handler;
tmp.sa_mask = 0;
tmp.sa_flags = SA_ONESHOT | SA_NOMASK;
tmp.sa_restorer = (void (*)(void)) restorer; //设置sigaction结构体
handler = (long) current->sigaction[signum-1].sa_handler;
current->sigaction[signum-1] = tmp; //将当前进程对应的信号结构体改为新分配的结构体
return handler; //返回处理函数
}
int sys_sigaction(int signum, const struct sigaction * action,
struct sigaction * oldaction)
{
struct sigaction tmp;
if (signum<1 || signum>32 || signum==SIGKILL)
return -1;
tmp = current->sigaction[signum-1];
get_new((char *) action,
(char *) (signum-1+current->sigaction));
if (oldaction)
save_old((char *) &tmp,(char *) oldaction);
if (current->sigaction[signum-1].sa_flags & SA_NOMASK)
current->sigaction[signum-1].sa_mask = 0;
else
current->sigaction[signum-1].sa_mask |= (1<<(signum-1));
return 0;
}
/*
 * Deliver signal signr to the current task by building a handler frame
 * on the stack addressed by esp (written with put_fs_long) and
 * replacing eip with the handler address.
 *
 * NOTE(review): the writes through &eip and &esp only have an effect if
 * these parameters alias the saved user registers on the kernel stack;
 * that set-up is in the caller and is not visible here.
 */
void do_signal(long signr,long eax, long ebx, long ecx, long edx,
long fs, long es, long ds,
long eip, long cs, long eflags,
unsigned long * esp, long ss)
{
unsigned long sa_handler;
long old_eip=eip;
struct sigaction * sa = current->sigaction + signr - 1;
int longs;
unsigned long * tmp_esp;
sa_handler = (unsigned long) sa->sa_handler;
/* handler value 1: do nothing (presumably SIG_IGN) */
if (sa_handler==1)
return;
/* handler value 0: SIGCHLD is dropped, any other signal ends the task */
if (!sa_handler) {
if (signr==SIGCHLD)
return;
else
do_exit(1<<(signr-1));
}
/* one-shot handlers are cleared before they run */
if (sa->sa_flags & SA_ONESHOT)
sa->sa_handler = NULL;
/* resume execution at the handler instead of the interrupted eip */
*(&eip) = sa_handler;
/* frame size in longs: one extra slot for the blocked mask unless SA_NOMASK */
longs = (sa->sa_flags & SA_NOMASK)?7:8;
*(&esp) -= longs;
verify_area(esp,longs*4);
tmp_esp=esp;
/* frame layout, lowest address first: restorer, signr, [blocked], eax, ecx, edx, eflags, old eip */
put_fs_long((long) sa->sa_restorer,tmp_esp++);
put_fs_long(signr,tmp_esp++);
if (!(sa->sa_flags & SA_NOMASK))
put_fs_long(current->blocked,tmp_esp++);
put_fs_long(eax,tmp_esp++);
put_fs_long(ecx,tmp_esp++);
put_fs_long(edx,tmp_esp++);
put_fs_long(eflags,tmp_esp++);
put_fs_long(old_eip,tmp_esp++);
/* block the handler's mask from now on */
current->blocked |= sa->sa_mask;
}
// Line 12
// Line 37
// Line 45
typedef unsigned int sigset_t;
/* Per-signal disposition; each task holds an array of 32 of these. */
struct sigaction {
void (*sa_handler)(int); // signal handler pointer
sigset_t sa_mask; // signals OR-ed into the blocked set when the handler is entered
int sa_flags; // SA_* flags (the code here tests SA_ONESHOT and SA_NOMASK)
void (*sa_restorer)(void); // restorer function pointer (said to be supplied by libc)
};
(3)exit.c
int sys_pause(void);
int sys_close(int fd);
/*
 * Remove task p from the task[] array, free the page that holds it and
 * reschedule. A NULL p is ignored; a p that is not in slots
 * 1..NR_TASKS-1 triggers a panic.
 */
void release(struct task_struct * p)
{
	int slot = 1;

	if (p == NULL)
		return;
	while (slot < NR_TASKS && task[slot] != p)
		slot++;
	if (slot >= NR_TASKS) {
		panic("trying to release non-existent task");
		return;
	}
	task[slot] = NULL;
	free_page((long)p);
	schedule();
}
/*
 * Set the pending bit for signal sig on task p.
 * Allowed when priv is non-zero, when the sender's euid equals p's
 * euid, or when suser() is true.
 * Returns 0 on success, -EINVAL for a NULL task or a signal outside
 * 1..32, and -EPERM when permission is denied.
 */
static inline int send_sig(long sig,struct task_struct * p,int priv)
{
	if (!p || sig<1 || sig>32)
		return -EINVAL;
	if (!(priv || (current->euid==p->euid) || suser()))
		return -EPERM;
	p->signal |= (1<<(sig-1));
	return 0;
}
/*
 * Mark SIGHUP pending on every task, other than the one in slot 0,
 * whose session equals the current task's session.
 */
static void kill_session(void)
{
	struct task_struct **slot;

	/* walk from the last slot down to, but not including, FIRST_TASK */
	for (slot = task + NR_TASKS - 1 ; slot > &FIRST_TASK ; slot--) {
		if (!*slot)
			continue;
		if ((*slot)->session != current->session)
			continue;
		(*slot)->signal |= 1<<(SIGHUP-1);
	}
}
/*
 * XXX need to check permissions needed to send signals to process
 * groups, etc. etc.  kill() permissions semantics are tricky!
 */
/*
 * Send signal sig according to pid (kill() delivers any signal, not
 * just a fatal one):
 *   pid == 0   every task whose pgrp equals the caller's pid
 *   pid >  0   the task with that pid
 *   pid == -1  every task except the one in slot 0
 *   pid <  -1  every task whose pgrp equals -pid
 * Returns 0, or the last error reported by send_sig().
 *
 * Every branch is braced on purpose. In the unbraced form the final
 * "else" attached to the inner "if (err = send_sig(...))" of the
 * pid == -1 loop, so pid < -1 did nothing and pid == -1 ran the
 * process-group loop from inside the broadcast loop.
 *
 * NOTE(review): the pid == 0 case compares pgrp with current->pid, not
 * current->pgrp; this is kept as found and only matches POSIX when the
 * caller is its group leader.
 */
int sys_kill(int pid,int sig)
{
	struct task_struct **p = NR_TASKS + task;	/* one past the last slot */
	int err, retval = 0;

	if (!pid) {
		while (--p > &FIRST_TASK) {
			if (*p && (*p)->pgrp == current->pid)
				if ((err = send_sig(sig,*p,1)))
					retval = err;
		}
	} else if (pid > 0) {
		while (--p > &FIRST_TASK) {
			if (*p && (*p)->pid == pid)
				if ((err = send_sig(sig,*p,0)))
					retval = err;
		}
	} else if (pid == -1) {
		while (--p > &FIRST_TASK) {
			/* skip empty slots so they do not turn into -EINVAL */
			if (*p && (err = send_sig(sig,*p,0)))
				retval = err;
		}
	} else {
		while (--p > &FIRST_TASK) {
			if (*p && (*p)->pgrp == -pid)
				if ((err = send_sig(sig,*p,0)))
					retval = err;
		}
	}
	return retval;
}
/*
 * Notify the parent (the task whose pid equals the argument) that a
 * child has changed state by marking SIGCHLD pending on it.
 * If pid is 0 or no such task exists, log a message and release the
 * current task directly.
 */
static void tell_father(int pid)
{
	int i;

	if (pid)
		for (i=0;i<NR_TASKS;i++) {
			if (!task[i])
				continue;
			if (task[i]->pid != pid)
				continue;
			task[i]->signal |= (1<<(SIGCHLD-1));
			return;
		}
/* if we don't find any fathers, we just release ourselves */
/* This is not really OK. Must change it to make father 1 */
	/* the trailing "\n\r" escapes had lost their backslashes */
	printk("BAD BAD - no father found\n\r");
	release(current);
}
/*
 * Terminate the current task: free its memory mappings, hand its
 * children to task 1, close its files, drop its inode references, mark
 * it TASK_ZOMBIE with exit code `code`, notify the parent and
 * reschedule. The final return only exists to suppress warnings.
 */
int do_exit(long code)
{
int i;
// ldt[] indices: 0 = null, 1 = code, 2 = data
//#define LDT_NUL 0
//#define LDT_CODE 1
//#define LDT_DATA 2
free_page_tables(get_base(current->ldt[1]),get_limit(0x0f)); // free the pages mapped by the code segment
free_page_tables(get_base(current->ldt[2]),get_limit(0x17)); // and those mapped by the data segment
for (i=0 ; i<NR_TASKS ; i++) // scan every slot for children of the exiting task
if (task[i] && task[i]->father == current->pid) {
task[i]->father = 1; // re-parent the child to pid 1
if (task[i]->state == TASK_ZOMBIE) // a child that is already a zombie needs reaping
/* assumption task[1] is always init */
(void) send_sig(SIGCHLD, task[1], 1); // so tell task[1] about it
}
for (i=0 ; i<NR_OPEN ; i++)
if (current->filp[i]) // close every file the task still has open
sys_close(i);
iput(current->pwd); // release the working-directory, root and executable inodes
current->pwd=NULL;
iput(current->root);
current->root=NULL;
iput(current->executable);
current->executable=NULL;
if (current->leader && current->tty >= 0) // a leader that owns a tty
tty_table[current->tty].pgrp = 0; // detaches the tty from its process group
if (last_task_used_math == current)
last_task_used_math = NULL;
if (current->leader)
kill_session(); // SIGHUP every task in this session
current->state = TASK_ZOMBIE; // keep the task_struct until the parent collects it
current->exit_code = code;
tell_father(current->father); // mark SIGCHLD pending on the parent
schedule();
return (-1); /* just to suppress warnings */
}
/*
 * exit() system call: pass the low 8 bits of error_code, shifted into
 * bits 8..15, to do_exit().
 */
int sys_exit(int error_code)
{
	long status = (error_code & 0xff) << 8;

	return do_exit(status);
}
/*
 * Wait for a child of the current task to stop or exit.
 *
 * pid selects the children considered:
 *   pid > 0    only the child with that pid
 *   pid == 0   children in the caller's process group
 *   pid == -1  any child
 *   pid < -1   children in process group -pid
 * The status word is written to stat_addr with put_fs_long().
 *
 * Returns the pid of the child reported, 0 when WNOHANG is set and no
 * child is ready, -EINTR when woken by a signal other than SIGCHLD, or
 * -ECHILD when no matching child exists.
 */
int sys_waitpid(pid_t pid,unsigned long * stat_addr, int options)
{
int flag, code;
struct task_struct ** p;
verify_area(stat_addr,4);
repeat:
flag=0;
for(p = &LAST_TASK ; p > &FIRST_TASK ; --p) {
if (!*p || *p == current) // skip empty slots and the caller itself
continue;
if ((*p)->father != current->pid) // skip tasks that are not children of the caller
continue;
if (pid>0) { // pid > 0: only that exact child
if ((*p)->pid != pid)
continue;
} else if (!pid) { // pid == 0: children in the caller's process group
if ((*p)->pgrp != current->pgrp)
continue;
} else if (pid != -1) { // pid < -1: children in process group -pid
if ((*p)->pgrp != -pid)
continue;
}
switch ((*p)->state) { // pid == -1 falls straight through to here; act on the child's state
case TASK_STOPPED: // stopped children are reported only when WUNTRACED is set
if (!(options & WUNTRACED))
continue;
put_fs_long(0x7f,stat_addr);
return (*p)->pid;
case TASK_ZOMBIE:
// add the child's times to the parent's child-time totals, then reap it
current->cutime += (*p)->utime;
current->cstime += (*p)->stime;
flag = (*p)->pid;
code = (*p)->exit_code;
release(*p);
put_fs_long(code,stat_addr);
return flag;
default: // any other state: remember that a matching child exists
flag=1;
continue;
}
}
if (flag) {
if (options & WNOHANG) // WNOHANG: return at once instead of sleeping
return 0;
current->state=TASK_INTERRUPTIBLE; // otherwise sleep interruptibly until something wakes us
schedule();
// clear a pending SIGCHLD; rescan if nothing else is pending, else report the interruption
if (!(current->signal &= ~(1<<(SIGCHLD-1))))
goto repeat;
else
return -EINTR;
}
return -ECHILD;
}
do_exit()
① 释放进程的代码段和数据段占用的内存。
② 关闭进程打开的所有文件,对当前目录和i节点进行同步(文件操作)。
③ 如果当前要销毁的进程有子进程,就让1号进程作为新的父进程。
④ 如果当前进程是一个会话头进程,则会终止会话中的所有进程。
⑤ 改变当前进程的运行状态,变成TASK_ZOMBIE(僵死)状态,并且向其父进程发送SIGCHLD信号,说明自己要死了。
sys_waitpid()
① 父进程在运行子进程时一般都会运行wait waitpid这两个函数,用来父进程等待子进程终止。
② 当父进程收到SIGCHLD信号时,父进程会终止僵死状态的子进程。
③ 父进程会把子进程的运行时间累加到自己的运行时间上。
④ 把对应子进程的进程描述结构体进行释放,置空数组空槽。
三
进程
1.内核进程初始化与创建
每创建一个进程就对应着一个task_struct结构体。
/* Per-task descriptor; one exists for every created process. */
struct task_struct {
/* these are hardcoded - don't touch */
long state; /* -1 unrunnable, 0 runnable, >0 stopped */
long counter;
long priority;
long signal;
struct sigaction sigaction[32];
long blocked; /* bitmap of masked signals */
/* various fields */
int exit_code;
unsigned long start_code,end_code,end_data,brk,start_stack;
long pid,father,pgrp,session,leader;
unsigned short uid,euid,suid;
unsigned short gid,egid,sgid;
long alarm;
long utime,stime,cutime,cstime,start_time;
unsigned short used_math;
/* file system info */
int tty; /* -1 if no tty, so it must be signed */
unsigned short umask;
struct m_inode * pwd;
struct m_inode * root;
struct m_inode * executable;
unsigned long close_on_exec;
struct file * filp[NR_OPEN];
/* ldt for this task 0 - zero 1 - cs 2 - ds&ss */
struct desc_struct ldt[3];
/* tss for this task */
struct tss_struct tss; // saved register state of the task
};
(1)0号和1号进程的创建
Linux在初始化的过程中会进行0号进程的创建。
注:分析0.11的main函数
/*
 * Kernel entry point after start-up: sizes memory, runs the subsystem
 * initialisers, enables interrupts, drops to user mode, forks the task
 * that runs init(), and then loops in pause().
 */
void main(void) /* This really IS void, no error here. */
{ /* The startup routine assumes (well, ...) this */
/*
* Interrupts are still disabled. Do necessary setups, then
* enable them
*/
ROOT_DEV = ORIG_ROOT_DEV;
drive_info = DRIVE_INFO;
/* total memory = 1MB + extended memory (in kB), rounded down to a page, capped at 16MB */
memory_end = (1<<20) + (EXT_MEM_K<<10);
memory_end &= 0xfffff000;
if (memory_end > 16*1024*1024)
memory_end = 16*1024*1024;
/* buffer area end scales with the amount of memory: 4MB, 2MB or 1MB */
if (memory_end > 12*1024*1024)
buffer_memory_end = 4*1024*1024;
else if (memory_end > 6*1024*1024)
buffer_memory_end = 2*1024*1024;
else
buffer_memory_end = 1*1024*1024;
main_memory_start = buffer_memory_end;
main_memory_start += rd_init(main_memory_start, RAMDISK*1024);
mem_init(main_memory_start,memory_end);
trap_init();
blk_dev_init();
chr_dev_init();
tty_init();
time_init();
sched_init();
buffer_init(buffer_memory_end);
hd_init();
floppy_init();
sti();
move_to_user_mode(); // switch to user mode
if (!fork()) { /* fork() returns 0 in the new child, which goes on to run init() */
init();
}
/* the original task stays here, pausing forever */
for(;;) pause();
}
内核要先切换到用户态,之后再由0号进程fork生成1号进程(0号进程本身由sched_init()等初始化代码静态建立,并不是fork出来的)。
/*
 * Switch to user mode by building an interrupt-return frame by hand
 * (ss = 0x17, the current esp, eflags, cs = 0x0f, eip = label 1) and
 * executing iret; execution resumes at label 1, where ds, es, fs and gs
 * are loaded with selector 0x17.
 *
 * The line-continuation backslashes and the "\n\t" / "\t" escapes inside
 * the strings are required: without them the macro body is empty and
 * the instructions are glued together.
 */
#define move_to_user_mode() \
__asm__ ("movl %%esp,%%eax\n\t" \
	"pushl $0x17\n\t" \
	"pushl %%eax\n\t" \
	"pushfl\n\t" \
	"pushl $0x0f\n\t" \
	"pushl $1f\n\t" \
	"iret\n" \
	"1:\tmovl $0x17,%%eax\n\t" \
	"movw %%ax,%%ds\n\t" \
	"movw %%ax,%%es\n\t" \
	"movw %%ax,%%fs\n\t" \
	"movw %%ax,%%gs" \
	:::"ax")
iret是从中断返回的指令。执行iret时,前面5个push压入的数据会依次出栈;按压栈顺序,它们分别对应ss,esp,eflags,cs,eip(出栈顺序与之相反)。
fork生成1号进程之后,1号进程会执行init()进行初始化,进一步分析如下:
static char * argv_rc[] = { "/bin/sh", NULL };
static char * envp_rc[] = { "HOME=/", NULL };
static char * argv[] = { "-/bin/sh",NULL };
static char * envp[] = { "HOME=/usr/root", NULL };
/*
 * Body of the task forked from main(): sets up the drives, opens
 * /dev/tty0 as descriptors 0, 1 and 2, runs /bin/sh once with /etc/rc
 * as its input, and then keeps respawning a login shell forever.
 *
 * The printf format strings carry explicit "\n\r" / "\r\n" sequences;
 * these had lost their backslashes and are restored here.
 */
void init(void)
{
	int pid,i;

	setup((void *) &drive_info);
	(void) open("/dev/tty0",O_RDWR,0);	/* first open: becomes descriptor 0 */
	(void) dup(0);				/* descriptor 1 */
	(void) dup(0);				/* descriptor 2 */
	printf("%d buffers = %d bytes buffer space\n\r",NR_BUFFERS,
		NR_BUFFERS*BLOCK_SIZE);
	printf("Free mem: %d bytes\n\r",memory_end-main_memory_start);
	if (!(pid=fork())) {
		/* child: reopen descriptor 0 on /etc/rc so the shell reads it */
		close(0);
		if (open("/etc/rc",O_RDONLY,0))
			_exit(1);	/* open did not return descriptor 0 */
		execve("/bin/sh",argv_rc,envp_rc);
		_exit(2);		/* only reached if execve fails */
	}
	if (pid>0)
		/* parent: wait until that particular child has exited */
		while (pid != wait(&i))
			/* nothing */;
	while (1) {
		/* respawn loop: start a shell, wait for it to die, repeat */
		if ((pid=fork())<0) {
			printf("Fork failed in init\r\n");
			continue;
		}
		if (!pid) {
			/* child: new session with fresh descriptors 0..2 on /dev/tty0 */
			close(0);close(1);close(2);
			setsid();
			(void) open("/dev/tty0",O_RDWR,0);
			(void) dup(0);
			(void) dup(0);
			_exit(execve("/bin/sh",argv,envp));
		}
		while (1)
			if (pid == wait(&i))
				break;
		printf("\n\rchild %d died with code %04x\n\r",pid,i);
		sync();
	}
	_exit(0);	/* NOTE! _exit, not exit() */
}
① 0号进程打开标准输入输出错误句柄
② 创建子进程:子进程先打开"/etc/rc"文件作为标准输入,再执行/bin/sh
③ 如果1号进程创建失败,会换一种方式再次创建
④ 之后就是进行pause()暂停状态,系统等待运行下一步
####
2、普通进程的创建(WORKING)
众所周知每创建一个进程都会创建一个相对应的task_struct结构体,task结构体里就有代表该进程唯一的PID。
3、进程的调度与切换
以下是sched.c中的代码。
/*
 * Print one task's slot index, pid and state, plus the number of
 * leading zero bytes found directly after its task_struct in the
 * 4096-byte page it lives in (reported as free kernel-stack bytes).
 *
 * nr is the task[] slot index passed by show_stat(), not the pid.
 * The "\n\r" at the end of the second message had lost its backslashes
 * and is restored here.
 */
void show_task(int nr,struct task_struct * p)
{
	int i,j = 4096-sizeof(struct task_struct);

	printk("%d: pid=%d, state=%d, ",nr,p->pid,p->state);
	i=0;
	/* count zero bytes upward from the end of the task_struct */
	while (i<j && !((char *)(p+1))[i])
		i++;
	printk("%d (of %d) chars free in kernel stack\n\r",i,j);
}
/* Call show_task() for every occupied task[] slot, in ascending order. */
void show_stat(void)
{
	int slot = 0;

	while (slot < NR_TASKS) {
		if (task[slot] != 0)
			show_task(slot,task[slot]);
		slot++;
	}
}
extern void mem_use(void);
extern int timer_interrupt(void);
extern int system_call(void);
/*
 * A task_struct overlaid on a PAGE_SIZE byte array, so the descriptor
 * and the rest of its page form a single object.
 */
union task_union {
struct task_struct task;
char stack[PAGE_SIZE];
};
/* statically initialised first task */
static union task_union init_task = {INIT_TASK,};
long volatile jiffies=0;
long startup_time=0;
/* the running task; initially the static first task */
struct task_struct *current = &(init_task.task);
/* task whose math state is currently loaded (see math_state_restore) */
struct task_struct *last_task_used_math = NULL;
/* task table: slot 0 holds the static first task, the rest start empty */
struct task_struct * task[NR_TASKS] = {&(init_task.task), };
long user_stack [ PAGE_SIZE>>2 ] ;
/* pointer to the end of user_stack paired with the value 0x10 */
struct {
long * a;
short b;
} stack_start = { & user_stack [PAGE_SIZE>>2] , 0x10 };
/*
* 'math_state_restore()' saves the current math information in the
* old math state array, and gets the new ones from the current task
*/
void math_state_restore() // swaps the math (coprocessor) state over to the current task
{
/* nothing to do when the current task already owns the math state */
if (last_task_used_math == current)
return;
__asm__("fwait");
/* save the previous owner's state into its tss */
if (last_task_used_math) {
__asm__("fnsave %0"::"m" (last_task_used_math->tss.i387));
}
last_task_used_math=current;
/* reload a saved state, or initialise the unit on first use */
if (current->used_math) {
__asm__("frstor %0"::"m" (current->tss.i387));
} else {
__asm__("fninit"::);
current->used_math=1;
}
}
/*
* 'schedule()' is the scheduler function. This is GOOD CODE! There
* probably won't be any reason to change this, as it should work well
* in all circumstances (ie gives IO-bound processes good response etc).
* The one thing you might take a look at is the signal-handler code here.
*
* NOTE!! Task 0 is the 'idle' task, which gets called when no other
* tasks can run. It can not be killed, and it cannot sleep. The 'state'
* information in task[0] is never used.
*/
void schedule(void)
{
int i,next,c;
struct task_struct ** p;
/* check alarm, wake up any interruptible tasks that have got a signal */
/*
Task states, as described by the article:
#define TASK_RUNNING 0 running or ready to run
#define TASK_INTERRUPTIBLE 1 interruptible sleep: a signal makes it runnable again
#define TASK_UNINTERRUPTIBLE 2 uninterruptible sleep: only an explicit wake-up makes it runnable
#define TASK_ZOMBIE 3 zombie: stopped running, task_struct not yet released
#define TASK_STOPPED 4 stopped
*/
for(p = &LAST_TASK ; p > &FIRST_TASK ; --p) // walk the table from the last slot downwards
if (*p) { // occupied slot
if ((*p)->alarm && (*p)->alarm < jiffies) { // alarm is set and its deadline has passed
(*p)->signal |= (1<<(SIGALRM-1));
(*p)->alarm = 0;
}
// wake an interruptible sleeper with a pending signal that is not both blockable and blocked
if (((*p)->signal & ~(_BLOCKABLE & (*p)->blocked)) &&
(*p)->state==TASK_INTERRUPTIBLE)
(*p)->state=TASK_RUNNING;
}
/* this is the scheduler proper: */
while (1) { // pick the runnable task with the largest counter
c = -1;
next = 0;
i = NR_TASKS;
p = &task[NR_TASKS];
while (--i) {
if (!*--p)
continue;
if ((*p)->state == TASK_RUNNING && (*p)->counter > c)
c = (*p)->counter, next = i; // c tracks the largest counter, next the task[] index holding it
}
// c != 0: either a task with time left was found, or none is runnable (c == -1, next == 0)
if (c) break;
// every runnable task has counter 0: recompute the counter of all tasks
for(p = &LAST_TASK ; p > &FIRST_TASK ; --p)
if (*p)
(*p)->counter = ((*p)->counter >> 1) + // counter = counter/2 + priority
(*p)->priority;
}
switch_to(next); // switch to the chosen task
}
/*
这部分代码的目的是在所有就绪状态的任务进程中筛选出counter值最大的进程ID。之后如果counter值不为0则进入调度这个进程执行,如果counter值为0,则说明所有就绪状态的进程的时间片都已用完,需要重新调整所有进程的时间片。
*/
/*#define switch_to(n) {\
struct {long a,b;} __tmp; \
__asm__("cmpl %%ecx,_current\n\t" \
	"je 1f\n\t" \
	"movw %%dx,%1\n\t" \
	"xchgl %%ecx,_current\n\t" \
	"ljmp %0\n\t" \
	"cmpl %%ecx,_last_task_used_math\n\t" \
	"jne 1f\n\t" \
	"clts\n" \
	"1:" \
	::"m" (*&__tmp.a),"m" (*&__tmp.b), \
	"d" (_TSS(n)),"c" ((long) task[n])); \
}
*/
/*
 * pause() system call: put the current task into interruptible sleep
 * and run the scheduler. Returns 0 once the task runs again.
 */
int sys_pause(void)
{
current->state = TASK_INTERRUPTIBLE;
schedule();
return 0;
}
/*
 * Put the current task into uninterruptible sleep on the wait slot *p.
 * The task previously stored in *p is remembered and marked runnable
 * once this task resumes, which chains the waiters together.
 * Does nothing when p is NULL; panics when called by the static first task.
 */
void sleep_on(struct task_struct **p)
{
struct task_struct *tmp;
if (!p)
return;
if (current == &(init_task.task)) // the first task must never sleep: panic
panic("task[0] trying to sleep");
tmp = *p; // remember the previous waiter, if any
*p = current; // the wait slot now points at this task
current->state = TASK_UNINTERRUPTIBLE;
schedule();
// execution continues here after this task has been made runnable again
if (tmp)
tmp->state=0;
}
/*
 * Like sleep_on(), but the sleep is interruptible. After resuming, if
 * the wait slot points at a different task, that task is made runnable
 * and this one goes back to sleep; otherwise the slot is cleared and
 * the previously stored waiter is made runnable.
 */
void interruptible_sleep_on(struct task_struct **p)
{
struct task_struct *tmp;
if (!p)
return;
if (current == &(init_task.task))
panic("task[0] trying to sleep");
tmp=*p;
*p=current;
repeat: current->state = TASK_INTERRUPTIBLE;
schedule();
/* a later sleeper took over the slot: run it first, then sleep again */
if (*p && *p != current) {
(**p).state=0;
goto repeat;
}
*p=NULL;
if (tmp)
tmp->state=0;
}
/*
 * Make the task stored in the wait slot *p runnable (state 0) and clear
 * the slot. Does nothing when p or *p is NULL.
 */
void wake_up(struct task_struct **p)
{
	if (!p || !*p)
		return;
	(*p)->state = 0;
	*p = NULL;
}
/*
* OK, here are some floppy things that shouldn't be in the kernel
* proper. They are here because the floppy needs a timer, and this
* was the easiest way of doing it.
*/
/* per-drive wait slots used by floppy_on()/do_floppy_timer() */
static struct task_struct * wait_motor[4] = {NULL,NULL,NULL,NULL};
/* per-drive tick counters; both are counted down in do_floppy_timer() */
static int mon_timer[4]={0,0,0,0};
static int moff_timer[4]={0,0,0,0};
/* value last written to FD_DOR; the high nibble holds one bit per drive */
unsigned char current_DOR = 0x0C;
/*
 * Turn on the bit for drive nr in the output register if needed and
 * return the value of mon_timer[nr], i.e. the ticks still to wait
 * before the drive is considered ready. Panics when nr > 3.
 */
int ticks_to_floppy_on(unsigned int nr)
{
extern unsigned char selected;
unsigned char mask = 0x10 << nr;
if (nr>3)
panic("floppy_on: nr>3");
moff_timer[nr]=10000; /* 100 s = very big :-) */
cli(); /* use floppy_off to turn it off */
mask |= current_DOR;
/* when `selected` is zero, replace the low two bits with nr */
if (!selected) {
mask &= 0xFC;
mask |= nr;
}
if (mask != current_DOR) {
outb(mask,FD_DOR);
/* a change in the high nibble restarts the wait at HZ/2 ticks; otherwise wait at least 2 ticks */
if ((mask ^ current_DOR) & 0xf0)
mon_timer[nr] = HZ/2;
else if (mon_timer[nr] < 2)
mon_timer[nr] = 2;
current_DOR = mask;
}
sti();
return mon_timer[nr];
}
/*
 * Sleep on wait_motor[nr] until ticks_to_floppy_on(nr) reports zero
 * remaining ticks.
 */
void floppy_on(unsigned int nr)
{
cli();
while (ticks_to_floppy_on(nr))
sleep_on(nr+wait_motor);
sti();
}
/*
 * Schedule drive nr to be switched off: set its off-timer to 3*HZ
 * ticks, which do_floppy_timer() counts down.
 */
void floppy_off(unsigned int nr)
{
moff_timer[nr]=3*HZ;
}
/*
 * Per-tick floppy bookkeeping for the four drives whose bit is set in
 * current_DOR: count down the on-timer and wake the waiter when it
 * reaches zero; once the on-timer is idle, count down the off-timer and
 * clear the drive's bit in the output register when it has run out.
 */
void do_floppy_timer(void)
{
	int drive;

	for (drive = 0 ; drive < 4 ; drive++) {
		unsigned char bit = 0x10 << drive;

		if (!(bit & current_DOR))
			continue;
		if (mon_timer[drive]) {
			mon_timer[drive]--;
			if (mon_timer[drive] == 0)
				wake_up(drive+wait_motor);
			continue;
		}
		if (moff_timer[drive]) {
			moff_timer[drive]--;
			continue;
		}
		current_DOR &= ~bit;
		outb(current_DOR,FD_DOR);
	}
}
/*
 * Fixed pool of TIME_REQUESTS timer entries; an entry is free when its
 * fn is NULL. next_timer heads the singly linked list of active entries.
 */
static struct timer_list {
long jiffies;
void (*fn)();
struct timer_list * next;
} timer_list[TIME_REQUESTS], * next_timer = NULL;
/*
 * Arrange for fn to be called after `jiffies` ticks. A NULL fn is
 * ignored; a non-positive delay calls fn at once. Panics when the
 * timer pool is exhausted. Runs with interrupts disabled (cli/sti).
 */
void add_timer(long jiffies, void (*fn)(void))
{
struct timer_list * p;
if (!fn)
return;
cli();
if (jiffies <= 0)
(fn)();
else {
/* find a free pool entry (fn == NULL) */
for (p = timer_list ; p < timer_list + TIME_REQUESTS ; p++)
if (!p->fn)
break;
if (p >= timer_list + TIME_REQUESTS)
panic("No more time requests free");
/* link the new entry in at the head of the active list */
p->fn = fn;
p->jiffies = jiffies;
p->next = next_timer;
next_timer = p;
/*
 * Move the new request toward the tail while its successor expires
 * sooner: subtract the successor's count, then swap the two entries'
 * fn and jiffies. Entries therefore appear to hold counts relative to
 * the entry before them; do_timer() only decrements the head.
 * NOTE(review): the successor's count is not adjusted after the swap
 * - confirm whether that is intended.
 */
while (p->next && p->next->jiffies < p->jiffies) {
p->jiffies -= p->next->jiffies;
fn = p->fn;
p->fn = p->next->fn;
p->next->fn = fn;
jiffies = p->jiffies;
p->jiffies = p->next->jiffies;
p->next->jiffies = jiffies;
p = p->next;
}
}
sti();
}
/*
 * Per-tick work: beep countdown, time accounting for the current task,
 * expiry of active timers, floppy timers, and time-slice accounting.
 * cpl is non-zero when the tick should be charged as user time;
 * schedule() is only called in that case.
 */
void do_timer(long cpl)
{
extern int beepcount;
extern void sysbeepstop(void);
if (beepcount)
if (!--beepcount)
sysbeepstop();
/* charge the tick to user time or system time */
if (cpl)
current->utime++;
else
current->stime++;
if (next_timer) {
/* only the head is decremented; run every head entry that has expired */
next_timer->jiffies--;
while (next_timer && next_timer->jiffies <= 0) {
void (*fn)(void);
fn = next_timer->fn;
/* free the entry and unlink it before calling the handler */
next_timer->fn = NULL;
next_timer = next_timer->next;
(fn)();
}
}
/* floppy timers only matter while a high-nibble bit of current_DOR is set */
if (current_DOR & 0xf0)
do_floppy_timer();
/* time slice not used up yet: keep running the current task */
if ((--current->counter)>0) return;
current->counter=0;
/* cpl == 0: return without rescheduling */
if (!cpl) return;
schedule();
}
/*
 * alarm() system call: set the current task's alarm to `seconds` from
 * now (expressed in jiffies), or clear it when seconds is not positive.
 * Returns the seconds that were left on the previous alarm, or 0 when
 * none was set.
 */
int sys_alarm(long seconds)
{
	int remaining = current->alarm;

	if (remaining)
		remaining = (remaining - jiffies) / HZ;
	if (seconds > 0)
		current->alarm = jiffies+HZ*seconds;
	else
		current->alarm = 0;
	return (remaining);
}
/* Return the pid of the current task. */
int sys_getpid(void)
{
	int pid = current->pid;

	return pid;
}
/* Return the pid recorded as the current task's parent (its `father` field). */
int sys_getppid(void)
{
	int parent = current->father;

	return parent;
}
/* Return the uid field of the current task. */
int sys_getuid(void)
{
	int uid = current->uid;

	return uid;
}
/* Return the euid field of the current task. */
int sys_geteuid(void)
{
	int euid = current->euid;

	return euid;
}
/* Return the gid field of the current task. */
int sys_getgid(void)
{
	int gid = current->gid;

	return gid;
}
/* Return the egid field of the current task. */
int sys_getegid(void)
{
	int egid = current->egid;

	return egid;
}
/*
 * nice() system call: lower the current task's priority by increment,
 * but only when the result stays above zero. Always returns 0.
 */
int sys_nice(long increment)
{
	long lowered = current->priority - increment;

	if (lowered > 0)
		current->priority = lowered;
	return 0;
}
/*
 * Scheduler set-up: install the TSS/LDT descriptors of the static first
 * task, clear the remaining task slots and descriptor pairs, program
 * the timer and install the timer-interrupt and system-call gates.
 */
void sched_init(void)
{
int i;
struct desc_struct * p;
if (sizeof(struct sigaction) != 16)
panic("Struct sigaction MUST be 16 bytes");
set_tss_desc(gdt+FIRST_TSS_ENTRY,&(init_task.task.tss));
set_ldt_desc(gdt+FIRST_LDT_ENTRY,&(init_task.task.ldt));
/* skip the first task's two descriptors, then zero two descriptors per remaining slot */
p = gdt+2+FIRST_TSS_ENTRY;
for(i=1;i<NR_TASKS;i++) {
task[i] = NULL;
p->a=p->b=0;
p++;
p->a=p->b=0;
p++;
}
/* Clear NT, so that we won't have troubles with that later on */
__asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl");
/* load the task register and LDT register for task 0 */
ltr(0);
lldt(0);
outb_p(0x36,0x43); /* binary, mode 3, LSB/MSB, ch 0 */
outb_p(LATCH & 0xff , 0x40); /* LSB */
outb(LATCH >> 8 , 0x40); /* MSB */
/* vector 0x20 -> timer_interrupt */
set_intr_gate(0x20,&timer_interrupt);
/* clear bit 0 of the value read from port 0x21 (presumably unmasking the timer IRQ) */
outb(inb_p(0x21)&~0x01,0x21);
/* vector 0x80 -> system_call */
set_system_gate(0x80,&system_call);
}
4、进程的销毁
就是exit.c函数的操作。
5、进程间的通信(WORKING)
(1)进程,线程
创建一个进程之后,就会对应一个task_struct结构体;fork之后,会进行写时复制(Copy-On-Write),也就是说子进程和父进程的内容大部分是一致的。
问:一个进程多个线程的调度方式和一个进程一个线程时的调度方式有什么区别?
答:没有区别,内核中线程和进程都需要do_fork来实现,所以没有区别。
四
操作系统的引导与启动
1、BIOS/Bootloader:
PC机上电后从BIOS(入口的总线地址为0xFFFF0)开始执行:BIOS完成一系列硬件初始化和参数设置后,把磁盘第一个扇区中的bootsect加载到内存的固定地址0x7c00;随后bootsect再把自己搬移到0x90000处继续运行。
2、bootsect.s(WORKING)
磁盘引导块程序,在磁盘的第一个扇区中的程序(0磁道,0磁头,1扇区)。
作用:首先将后续的setup.s代码从磁盘中加载到紧接着bootsect.s的地方,在显示屏上显示loading system ,再将操作系统加载到0x10000,最后转到setup.s运行。
! SYS_SIZE is the number of clicks (16 bytes) to be loaded.
! 0x3000 is 0x30000 bytes = 196kB, more than enough for current
! versions of linux
!
SYSSIZE = 0x3000
!
! bootsect.s (C) 1991 Linus Torvalds
!
! bootsect.s is loaded at 0x7c00 by the bios-startup routines, and moves
! iself out of the way to address 0x90000, and jumps there.
!
! It then loads 'setup' directly after itself (0x90200), and the system
! at 0x10000, using BIOS interrupts.
!
! NOTE! currently system is at most 8*65536 bytes long. This should be no
! problem, even in the future. I want to keep it simple. This 512 kB
! kernel size should be enough, especially as this doesn't contain the
! buffer cache as in minix
!
! The loader has been made as simple as possible, and continuos
! read errors will result in a unbreakable loop. Reboot by hand. It
! loads pretty fast by getting whole sectors at a time whenever possible.
! Export the section-boundary labels. The ".globl" directive had been
! lost, which left a bare list of symbols on the line.
.globl begtext, begdata, begbss, endtext, enddata, endbss
.text
begtext:
.data
begdata:
.bss
begbss:
.text
SETUPLEN = 4 ! nr of setup-sectors
BOOTSEG = 0x07c0 ! original address of boot-sector
INITSEG = 0x9000 ! we move boot here - out of the way
SETUPSEG = 0x9020 ! setup starts here
SYSSEG = 0x1000 ! system loaded at 0x10000 (65536).
ENDSEG = SYSSEG + SYSSIZE ! where to stop loading
! ROOT_DEV: 0x000 - same type of floppy as boot.
! 0x301 - first partition on first drive etc
ROOT_DEV = 0x306
entry start
! Copy this boot sector (256 words = 512 bytes) from BOOTSEG:0 to
! INITSEG:0, then continue executing in the copy.
start:
mov ax,#BOOTSEG
mov ds,ax
mov ax,#INITSEG
mov es,ax
mov cx,#256
sub si,si
sub di,di
rep
movw
! far jump to label go inside the relocated copy
jmpi go,INITSEG
! cs now holds INITSEG; point ds, es and ss at the same segment
go: mov ax,cs
mov ds,ax
mov es,ax
! put stack at 0x9ff00.
mov ss,ax
mov sp,#0xFF00 ! arbitrary value >>512
! load the setup-sectors directly after the bootblock.
! Note that 'es' is already set up.
! Read SETUPLEN sectors, starting at sector 2, to INITSEG:0x0200.
! On error (carry set) reset the drive and try again, forever.
load_setup:
mov dx,#0x0000 ! drive 0, head 0
mov cx,#0x0002 ! sector 2, track 0
mov bx,#0x0200 ! address = 512, in INITSEG
mov ax,#0x0200+SETUPLEN ! service 2, nr of sectors
int 0x13 ! read it
jnc ok_load_setup ! ok - continue
mov dx,#0x0000
mov ax,#0x0000 ! reset the diskette
int 0x13
j load_setup
! setup is loaded: record the sectors-per-track value, print the
! loading message, load the system image, choose the root device and
! jump to setup.
ok_load_setup:
! Get disk drive parameters, specifically nr of sectors/track
mov dl,#0x00
mov ax,#0x0800 ! AH=8 is get drive parameters
int 0x13
mov ch,#0x00
! keep only cl and store it in the 'sectors' word (addressed through cs)
seg cs
mov sectors,cx
! restore es to INITSEG before it is used again
mov ax,#INITSEG
mov es,ax
! Print some inane message
mov ah,#0x03 ! read cursor pos
xor bh,bh
int 0x10
! 24 = length of msg1 (2 + 18 + 4 bytes)
mov cx,#24
mov bx,#0x0007 ! page 0, attribute 7 (normal)
mov bp,#msg1
mov ax,#0x1301 ! write string, move cursor
int 0x10
! ok, we've written the message, now
! we want to load the system (at 0x10000)
mov ax,#SYSSEG
mov es,ax ! segment of 0x010000
call read_it
call kill_motor
! After that we check which root-device to use. If the device is
! defined (!= 0), nothing is done and the given device is used.
! Otherwise, either /dev/PS0 (2,28) or /dev/at0 (2,8), depending
! on the number of sectors that the BIOS reports currently.
seg cs
mov ax,root_dev
cmp ax,#0
jne root_defined
seg cs
mov bx,sectors
mov ax,#0x0208 ! /dev/ps0 - 1.2Mb
cmp bx,#15
je root_defined
mov ax,#0x021c ! /dev/PS0 - 1.44Mb
cmp bx,#18
je root_defined
! neither 15 nor 18 sectors per track: hang here
undef_root:
jmp undef_root
root_defined:
seg cs
mov root_dev,ax
! after that (everyting loaded), we jump to
! the setup-routine loaded directly after
! the bootblock:
jmpi 0,SETUPSEG
! This routine loads the system at address 0x10000, making sure
! no 64kB boundaries are crossed. We try to load it as fast as
! possible, loading whole tracks whenever we can.
!
! in: es - starting address segment (normally 0x1000)
!
sread: .word 1+SETUPLEN ! sectors read of current track
head: .word 0 ! current head
track: .word 0 ! current track
read_it:
mov ax,es
test ax,#0x0fff
die: jne die ! es must be at 64kB boundary
xor bx,bx ! bx is starting address within segment
rp_read:
mov ax,es
cmp ax,#ENDSEG ! have we loaded all yet?
jb ok1_read
ret
ok1_read:
! ax = sectors still unread on the current track
seg cs
mov ax,sectors
sub ax,sread
! cx = that many sectors in bytes, added to the current offset
mov cx,ax
shl cx,#9
add cx,bx
! no carry, or a result of exactly zero: the read fits in this segment
jnc ok2_read
je ok2_read
! otherwise read only what fits: ax = (0 - bx) / 512 sectors
xor ax,ax
sub ax,bx
shr ax,#9
ok2_read:
call read_track
! cx = sectors just read; ax = sectors now read on this track
mov cx,ax
add ax,sread
seg cs
cmp ax,sectors
jne ok3_read
! track side finished: ax = 1 - head
mov ax,#1
sub ax,head
! non-zero result (moving to head 1): same track; zero: advance the track
jne ok4_read
inc track
ok4_read:
mov head,ax
xor ax,ax
ok3_read:
mov sread,ax
! advance the offset by the bytes just read
shl cx,#9
add bx,cx
jnc rp_read
! offset wrapped past 64kB: step es to the next 64kB segment and restart at 0
mov ax,es
add ax,#0x1000
mov es,ax
xor bx,bx
jmp rp_read
! Read sectors from the current track/head position to es:bx.
! All four registers are saved and restored, so ax, bx, cx and dx are
! unchanged on return. On a read error the drive is reset and the read
! is retried.
read_track:
push ax
push bx
push cx
push dx
! cl = first sector to read (sread + 1), ch = track number
mov dx,track
mov cx,sread
inc cx
mov ch,dl
! dh = head, dl = drive 0; the mask keeps only bit 0 of the head
mov dx,head
mov dh,dl
mov dl,#0
and dx,#0x0100
! ah = 2 selects the read service; al is left as set by the caller
mov ah,#2
int 0x13
jc bad_rt
pop dx
pop cx
pop bx
pop ax
ret
! read failed: issue service 0 (reset) on drive 0, restore registers, retry
bad_rt: mov ax,#0
mov dx,#0
int 0x13
pop dx
pop cx
pop bx
pop ax
jmp read_track
!/*
! * This procedure turns off the floppy drive motor, so
! * that we enter the kernel in a known state, and
! * don't have to worry about it later.
! */
! The explanation above is written as '!' comments, the comment syntax
! used throughout this file; a bare C-style block would not be treated
! as a comment by the assembler.
! Writes 0 to port 0x3f2; dx is preserved.
kill_motor:
push dx
mov dx,#0x3f2
mov al,#0
outb
pop dx
ret
! Data area of the boot sector. The .word/.byte/.ascii/.org directives
! had been lost, leaving bare values; they are restored here.

! sectors per track, filled in by ok_load_setup
sectors:
.word 0
! 24-byte message printed while loading: CR LF, text, CR LF CR LF
msg1:
.byte 13,10
.ascii "Loading system ..."
.byte 13,10,13,10
! the last four bytes of the 512-byte sector: root device, boot signature
.org 508
root_dev:
.word ROOT_DEV
boot_flag:
.word 0xAA55
.text
endtext:
.data
enddata:
.bss
endbss:
3、setup.s(WORKING)
解析BIOS/Bootloader传进来的参数,设置系统内核运行的LDT(局部描述符),IDT(中断描述符) GDT(全局描述符),设置中断控制芯片,进入保护模式运行;跳转到head.s运行。
(C) 1991 Linus Torvalds
!
! setup.s is responsible for getting the system data from the BIOS,
! and putting them into the appropriate places in system memory.
! both setup.s and system has been loaded by the bootblock.
!
! This code asks the bios for memory/disk/other parameters, and
! puts them in a "safe" place: 0x90000-0x901FF, ie where the
! boot-block used to be. It is then up to the protected mode
! system to read them from there before the area is overwritten
! for buffer-blocks.
!
! NOTE! These had better be the same as in bootsect.s!
INITSEG = 0x9000 ! we move boot here - out of the way
SYSSEG = 0x1000 ! system loaded at 0x10000 (65536).
SETUPSEG = 0x9020 ! this is the current segment
begtext, begdata, begbss, endtext, enddata, endbss
.text
begtext:
.data
begdata:
.bss
begbss:
.text
entry start
start:
! Entry point of setup.s. Collects parameters from the BIOS and stores them
! at fixed offsets in segment INITSEG:
!   [0]    dx returned by int 0x10, ah=0x03 (cursor position)
!   [2]    ax returned by int 0x15, ah=0x88 (extended memory, kB)
!   [4]    bx, [6] ax returned by int 0x10, ah=0x0f (video data)
!   [8]    ax, [10] bx, [12] cx returned by int 0x10, ah=0x12, bl=0x10
!   0x0080 16 bytes copied from the far pointer stored at 4*0x41 (hd0)
!   0x0090 16 bytes copied from the far pointer stored at 4*0x46 (hd1)
! Finally asks int 0x13 (ax=0x1500, dl=0x81) whether a second disk exists and
! jumps to is_disk1 if so; otherwise execution falls through to no_disk1.
! ok, the read went well so we get current cursor position and save it for
! posterity.
mov ax,#INITSEG ! this is done in bootsect already, but...
mov ds,ax
mov ah,#0x03 ! read cursor pos
xor bh,bh
int 0x10 ! save it in known place, con_init fetches
mov [0],dx ! it from 0x90000.
! Get memory size (extended mem, kB)
mov ah,#0x88
int 0x15
mov [2],ax
! Get video-card data:
mov ah,#0x0f
int 0x10
mov [4],bx ! bh = display page
mov [6],ax ! al = video mode, ah = window width
! check for EGA/VGA and some config parameters
mov ah,#0x12
mov bl,#0x10
int 0x10
mov [8],ax
mov [10],bx
mov [12],cx
! Get hd0 data
mov ax,#0x0000
mov ds,ax
lds si,[4*0x41] ! ds:si = far pointer stored in vector slot 0x41
mov ax,#INITSEG
mov es,ax
mov di,#0x0080 ! destination INITSEG:0x0080
mov cx,#0x10 ! 16 bytes
rep
movsb
! Get hd1 data
mov ax,#0x0000
mov ds,ax
lds si,[4*0x46] ! ds:si = far pointer stored in vector slot 0x46
mov ax,#INITSEG
mov es,ax
mov di,#0x0090 ! destination INITSEG:0x0090
mov cx,#0x10 ! 16 bytes
rep
movsb
! Check that there IS a hd1 :-)
mov ax,#0x01500
mov dl,#0x81
int 0x13
jc no_disk1 ! carry set: treat as "no second disk"
cmp ah,#3
je is_disk1 ! ah == 3: keep the copied hd1 table
no_disk1:
! No second hard disk: overwrite the 16-byte hd1 table at INITSEG:0x0090
! with zeros.
mov ax,#INITSEG
mov es,ax
mov di,#0x0090
mov cx,#0x10
mov ax,#0x00
rep
stosb
is_disk1:
! now we want to move to protected mode ...
cli ! no interrupts allowed !
! first we move the system to it's rightful place
! The loop below copies one 64 kB block per iteration from segment ax+0x1000
! to segment ax, starting with destination segment 0 and stopping once the
! source segment would be 0x9000 (i.e. 0x10000-0x8ffff moves to 0x00000).
mov ax,#0x0000
cld ! 'direction'=0, movs moves forward
do_move:
mov es,ax ! destination segment
add ax,#0x1000
cmp ax,#0x9000
jz end_move
mov ds,ax ! source segment
sub di,di
sub si,si
mov cx,#0x8000 ! 0x8000 words = 64 kB
rep
movsw
jmp do_move
! then we load the segment descriptors
end_move:
! Loads the IDT/GDT registers, enables the A20 line through the keyboard
! controller, reprograms both 8259A interrupt controllers so that hardware
! interrupts start at 0x20, masks every interrupt, then sets the PE bit and
! far-jumps to offset 0 of the segment selected by selector 8.
! The ".word 0x00eb,0x00eb" lines encode two short "jmp $+2" instructions used
! as an I/O delay; the ".word" directive is restored here because the bare
! operands are not valid assembler input.
mov ax,#SETUPSEG ! right, forgot this at first. didn't work :-)
mov ds,ax
lidt idt_48 ! load idt with 0,0
lgdt gdt_48 ! load gdt with whatever appropriate
! that was painless, now we enable A20
call empty_8042
mov al,#0xD1 ! command write
out #0x64,al
call empty_8042
mov al,#0xDF ! A20 on
out #0x60,al
call empty_8042
! well, that went ok, I hope. Now we have to reprogram the interrupts :-(
! we put them right after the intel-reserved hardware interrupts, at
! int 0x20-0x2F. There they won't mess up anything. Sadly IBM really
! messed this up with the original PC, and they haven't been able to
! rectify it afterwards. Thus the bios puts interrupts at 0x08-0x0f,
! which is used for the internal hardware interrupts as well. We just
! have to reprogram the 8259's, and it isn't fun.
mov al,#0x11 ! initialization sequence
out #0x20,al ! send it to 8259A-1
.word 0x00eb,0x00eb ! jmp $+2, jmp $+2
out #0xA0,al ! and to 8259A-2
.word 0x00eb,0x00eb
mov al,#0x20 ! start of hardware int's (0x20)
out #0x21,al
.word 0x00eb,0x00eb
mov al,#0x28 ! start of hardware int's 2 (0x28)
out #0xA1,al
.word 0x00eb,0x00eb
mov al,#0x04 ! 8259-1 is master
out #0x21,al
.word 0x00eb,0x00eb
mov al,#0x02 ! 8259-2 is slave
out #0xA1,al
.word 0x00eb,0x00eb
mov al,#0x01 ! 8086 mode for both
out #0x21,al
.word 0x00eb,0x00eb
out #0xA1,al
.word 0x00eb,0x00eb
mov al,#0xFF ! mask off all interrupts for now
out #0x21,al
.word 0x00eb,0x00eb
out #0xA1,al
! well, that certainly wasn't fun :-(. Hopefully it works, and we don't
! need no steenking BIOS anyway (except for the initial loading :-).
! The BIOS-routine wants lots of unnecessary data, and it's less
! "interesting" anyway. This is how REAL programmers do it.
!
! Well, now's the time to actually move into protected mode. To make
! things as simple as possible, we do no register set-up or anything,
! we let the gnu-compiled 32-bit programs do that. We just jump to
! absolute address 0x00000, in 32-bit protected mode.
mov ax,#0x0001 ! protected mode (PE) bit
lmsw ax ! This is it!
jmpi 0,8 ! jmp offset 0 of segment 8 (cs)
! This routine checks that the keyboard command queue is empty
! No timeout is used - if this hangs there is something wrong with
! the machine, and we probably couldn't proceed anyway.
empty_8042:
! Busy-waits until bit 1 of the 8042 status port (0x64) is clear, i.e. the
! controller's input buffer is empty. Clobbers al. No timeout.
.word 0x00eb,0x00eb ! two "jmp $+2" as a short I/O delay (".word" restored)
in al,#0x64 ! 8042 status port
test al,#2 ! is input buffer full?
jnz empty_8042 ! yes - loop
ret
! Temporary descriptor tables used for the switch to protected mode.
! Every entry is emitted with ".word"; the directive is restored here because
! the bare operands are not valid assembler input.
! gdt holds three 8-byte descriptors: a null entry, a code segment (selector
! 0x08) and a data segment (selector 0x10), both with base 0.
gdt:
.word 0,0,0,0 ! dummy
.word 0x07FF ! 8Mb - limit=2047 (2048*4096=8Mb)
.word 0x0000 ! base address=0
.word 0x9A00 ! code read/exec
.word 0x00C0 ! granularity=4096, 386
.word 0x07FF ! 8Mb - limit=2047 (2048*4096=8Mb)
.word 0x0000 ! base address=0
.word 0x9200 ! data read/write
.word 0x00C0 ! granularity=4096, 386
! 6-byte operand for lidt: 16-bit limit followed by 32-bit base.
idt_48:
.word 0 ! idt limit=0
.word 0,0 ! idt base=0L
! 6-byte operand for lgdt: 16-bit limit followed by 32-bit base.
gdt_48:
.word 0x800 ! gdt limit=2048, 256 GDT entries
.word 512+gdt,0x9 ! gdt base = 0X9xxxx
.text
endtext:
.data
enddata:
.bss
endbss:
注:GDT,LDT,IDT表是什么?
GDT(global descriptor table),全局段描述符表,这些64 bit(8字节)的段描述符整齐地排列在内存中某一位置。而该位置的内存地址以及有效的个数就存放在GDTR中,GDTR是特殊的寄存器。GDT在系统内只存在一个。
LDT(local descriptor table),局部段描述符表,LDT在系统内可存在多个,每个任务最多只能拥有一个LDT,另外,每一个LDT自身作为一个段存在,它们的段描述符被放在GDT中。
IDT(interrupt descriptor table),中断描述符表,IDT记录了0~255的中断号码和中断服务函数的关系。当发生中断的时候,通过中断号码去执行中断服务函数。
GDT可以被放在内存的任何位置,那么当程序员通过段寄存器来引用一个段描述符时,CPU必须知道GDT的入口,也就是基地址放在哪里,所以Intel的设计者们提供了一个寄存器GDTR用来存放GDT的入口地址,程序员将GDT设定在内存中某个位置之后,可以通过LGDT指令将GDT的入口地址装入此寄存器,从此以后,CPU就根据此寄存器中的内容作为GDT的入口来访问GDT了。
IA-32为LDT的入口地址也提供了一个寄存器LDTR,因为在任何时刻只能有一个任务在运行,所以LDT寄存器全局也只需要有一个。如果一个任务拥有自身的LDT,那么当它需要引用自身的LDT时,它需要通过LLDT指令将其LDT的段描述符装入此寄存器。LLDT指令与LGDT指令不同的是,LGDT指令的操作数是一个32-bit的内存地址,这个内存地址处存放的是一个32-bit GDT的入口地址,以及16-bit的GDT Limit。而LLDT指令的操作数是一个16-bit的选择子,这个选择子主要内容是:被装入的LDT的段描述符在GDT中的索引值。
4、head.s(WORKING)
加载内核运行时的各数据段寄存器,重新设置中断描述符表,开启内核正常运行时的协处理器等资源;设置内存管理的分页机制,跳转到main.c运行。
/*
 * head.s contains the 32-bit startup code.
*
* NOTE!!! Startup happens at absolute address 0x00000000, which is also where
* the page directory will exist. The startup code will be overwritten by
* the page directory.
*/
/*
 * startup_32: first 32-bit code executed. Loads the data segment registers
 * with selector 0x10, sets up the stack, installs the IDT and GDT, reloads
 * the segment registers, verifies that A20 is enabled, prepares CR0 for the
 * math-coprocessor check and finally jumps to after_page_tables.
 * _pg_dir shares this address: the page directory is placed here.
 */
.text
.globl _idt,_gdt,_pg_dir,_tmp_floppy_area
_pg_dir:
startup_32:
movl $0x10,%eax
mov %ax,%ds
mov %ax,%es
mov %ax,%fs
mov %ax,%gs # the lines above load the data segment registers with 0x10
lss _stack_start,%esp # load ss:esp from the stack_start structure (defined outside this listing; presumably ss=0x10, esp=&user_stack[PAGE_SIZE>>2] - confirm)
call setup_idt # build and load the IDT, then the GDT
call setup_gdt
movl $0x10,%eax # reload all the segment registers
mov %ax,%ds # after changing gdt. CS was already
mov %ax,%es # reloaded in 'setup_gdt'
mov %ax,%fs
mov %ax,%gs
lss _stack_start,%esp
xorl %eax,%eax
1: incl %eax # check that A20 really IS enabled
movl %eax,0x000000 # loop forever if it isn't
cmpl %eax,0x100000
je 1b
/*
 * NOTE! 486 should set bit 16, to check for write-protect in supervisor
 * mode. Then it would be unnecessary with the "verify_area()"-calls.
 * 486 users probably want to set the NE (#5) bit also, so as to use
 * int 16 for math errors.
 */
movl %cr0,%eax # check math chip
andl $0x80000011,%eax # Save PG,PE,ET
/* "orl $0x10020,%eax" here for 486 might be good */
orl $2,%eax # set MP
movl %eax,%cr0
call check_x87
jmp after_page_tables
/*
 * We depend on ET to be correct. This checks for 287/387.
 *
 * After fninit the low byte of the status word is read into %al.
 * If it is 0 the code jumps to local label 1 and issues fsetpm.
 * Otherwise it falls through and toggles CR0 bits 1 and 2 (the caller has
 * just set MP and cleared EM, so this resets MP and sets EM).
 */
check_x87:
fninit
fstsw %ax
cmpb $0,%al
je 1f /* no coprocessor: have to set bits (this describes the fall-through path; the jump is taken when %al == 0) */
movl %cr0,%eax
xorl $6,%eax /* reset MP, set EM */
movl %eax,%cr0
ret
.align 2
1: .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */
ret
/*
 * setup_idt
 *
 * sets up a idt with 256 entries pointing to
 * ignore_int, interrupt gates. It then loads
 * idt. Everything that wants to install itself
 * in the idt-table may do so themselves. Interrupts
 * are enabled elsewhere, when we can be relatively
 * sure everything is ok. This routine will be over-
 * written by the page tables.
 *
 * Each 8-byte gate is built in %edx:%eax:
 *   %eax = selector 0x0008 (high 16 bits) | handler offset bits 0-15
 *   %edx = handler offset bits 16-31 (high 16 bits) | 0x8E00
 */
setup_idt:
lea ignore_int,%edx # edx = address of ignore_int
movl $0x00080000,%eax # put selector 0x0008 into the high 16 bits of eax
movw %dx,%ax /* selector = 0x0008 = cs; low 16 bits of eax = low 16 bits of the ignore_int address */
movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
lea _idt,%edi # edi = first IDT entry
mov $256,%ecx # number of entries to fill
rp_sidt:
movl %eax,(%edi) # low dword of the gate
movl %edx,4(%edi) # high dword of the gate
addl $8,%edi
dec %ecx
jne rp_sidt
lidt idt_descr
ret
/*
 * setup_gdt
 *
 * This routines sets up a new gdt and loads it.
 * Only two entries are currently built, the same
 * ones that were built in init.s. The routine
 * is VERY complicated at two whole lines, so this
 * rather long comment is certainly needed :-).
 * This routine will be overwritten by the page tables.
 */
setup_gdt:
lgdt gdt_descr # load GDTR from the 6-byte gdt_descr operand
ret
/*
 * I put the kernel page tables right after the page directory,
 * using 4 of them to span 16 Mb of physical memory. People with
 * more than 16MB will have to expand this.
 *
 * The .org directives place pg0..pg3 at offsets 0x1000..0x4000, one
 * 4 kB page each; code continues at offset 0x5000.
 */
.org 0x1000
pg0:
.org 0x2000
pg1:
.org 0x3000
pg2:
.org 0x4000
pg3:
.org 0x5000
/*
 * tmp_floppy_area is used by the floppy-driver when DMA cannot
 * reach to a buffer-block. It needs to be aligned, so that it isn't
 * on a 64kB border.
 */
_tmp_floppy_area:
.fill 1024,1,0 /* 1024 zero bytes */
/*
 * Builds the stack frame for main and jumps to setup_paging. Because
 * setup_paging ends with "ret", that return pops $_main and so transfers
 * control to main, with L6 as main's return address and three zero
 * arguments above it.
 */
after_page_tables:
pushl $0 # These are the parameters to main :-)
pushl $0
pushl $0
pushl $L6 # return address for main, if it decides to.
pushl $_main
jmp setup_paging
L6:
jmp L6 # main should never return here, but
# just in case, we know what happens.
/*
 * This is the default interrupt "handler" :-)
 *
 * Saves %eax, %ecx, %edx and %ds/%es/%fs, loads the data segment registers
 * with selector 0x10, prints int_msg through _printk, restores everything
 * and returns with iret.
 *
 * The message ends in "\n\r"; the backslashes had been lost in this listing,
 * leaving the literal text "interruptnr".
 */
int_msg:
.asciz "Unknown interrupt\n\r"
.align 2
ignore_int:
pushl %eax
pushl %ecx
pushl %edx
push %ds
push %es
push %fs
movl $0x10,%eax
mov %ax,%ds
mov %ax,%es
mov %ax,%fs
pushl $int_msg /* argument for _printk */
call _printk
popl %eax /* discard the argument */
pop %fs
pop %es
pop %ds
popl %edx
popl %ecx
popl %eax
iret
/*
 * Setup_paging
 *
 * This routine sets up paging by setting the page bit
 * in cr0. The page tables are set up, identity-mapping
 * the first 16MB. The pager assumes that no illegal
 * addresses are produced (ie >4Mb on a 4Mb machine).
 *
 * NOTE! Although all physical memory should be identity
 * mapped by this routine, only the kernel page functions
 * use the >1Mb addresses directly. All "normal" functions
 * use just the lower 1Mb, or the local data space, which
 * will be mapped to some other place - mm keeps track of
 * that.
 *
 * For those with more memory than 16 Mb - tough luck. I've
 * not got it, why should you :-) The source is here. Change
 * it. (Seriously - it shouldn't be too difficult. Mostly
 * change some constants etc. I left it at 16Mb, as my machine
 * even cannot be extended past that (ok, but it was cheap :-)
 * I've tried to show which constants to change by having
 * some kind of marker at them (search for "16Mb"), but I
 * won't guarantee that's all :-( )
 *
 * Steps: zero the 5 pages at address 0 (page directory + 4 page tables),
 * point the first four directory entries at pg0..pg3, fill the 4096 table
 * entries from the top down, load cr3 with 0 and set PG in cr0.
 * The final "ret" pops whatever address the caller pushed last.
 */
.align 2
setup_paging:
movl $1024*5,%ecx /* 5 pages - pg_dir+4 page tables */
xorl %eax,%eax
xorl %edi,%edi /* pg_dir is at 0x000 */
cld;rep;stosl
movl $pg0+7,_pg_dir /* set present bit/user r/w */
movl $pg1+7,_pg_dir+4 /* --------- " " --------- */
movl $pg2+7,_pg_dir+8 /* --------- " " --------- */
movl $pg3+7,_pg_dir+12 /* --------- " " --------- */
movl $pg3+4092,%edi /* last entry of the last page table */
movl $0xfff007,%eax /* 16Mb - 4096 + 7 (r/w user,p) */
std
1: stosl /* fill pages backwards - more efficient :-) */
subl $0x1000,%eax
jge 1b /* stop once eax has gone negative */
cld /* restore DF=0: compiled C code entered via the ret below expects the direction flag clear */
xorl %eax,%eax /* pg_dir is at 0x0000 */
movl %eax,%cr3 /* cr3 - page directory start */
movl %cr0,%eax
orl $0x80000000,%eax
movl %eax,%cr0 /* set paging (PG) bit */
ret /* this also flushes prefetch-queue */
/*
 * Operands for lidt/lgdt (16-bit limit followed by 32-bit base) and the
 * tables themselves. The ".word 0" before each descriptor is padding so
 * that the .long base lands on a 4-byte boundary after the 2-byte limit.
 */
.align 2
.word 0
idt_descr:
.word 256*8-1 # idt contains 256 entries
.long _idt
.align 2
.word 0
gdt_descr:
.word 256*8-1 # so does gdt (not that that's any
.long _gdt # magic number, but it works for me :^)
.align 3 # NOTE(review): presumably 2^3 = 8-byte alignment with this assembler - confirm
_idt: .fill 256,8,0 # idt is uninitialized
_gdt: .quad 0x0000000000000000 /* NULL descriptor */
.quad 0x00c09a0000000fff /* 16Mb */
.quad 0x00c0920000000fff /* 16Mb */
.quad 0x0000000000000000 /* TEMPORARY - don't use */
.fill 252,8,0 /* remaining 252 descriptors, zero-filled */
5、main.c(WORKING)
/*
 * Kernel C entry point.
 *
 * Reads the boot parameters, sizes the memory regions, runs the subsystem
 * initialisers in a fixed order, enables interrupts, switches to user mode
 * and forks; the child runs init() while the parent loops in pause().
 *
 * Memory layout computed here:
 *   memory_end        = 1 MB + extended memory, rounded down to a 4 kB
 *                       boundary and capped at 16 MB
 *   buffer_memory_end = 4 MB if memory_end > 12 MB, 2 MB if > 6 MB, else 1 MB
 *   main_memory_start = buffer_memory_end, plus the RAM-disk size when the
 *                       kernel is built with RAMDISK defined
 *
 * The rd_init() call is wrapped in #ifdef RAMDISK: without the guard the
 * file does not compile when RAMDISK is not defined.
 */
void main(void)		/* This really IS void, no error here. */
{			/* The startup routine assumes (well, ...) this */
/*
 * Interrupts are still disabled. Do necessary setups, then
 * enable them
 */
	ROOT_DEV = ORIG_ROOT_DEV;
	drive_info = DRIVE_INFO;
	memory_end = (1<<20) + (EXT_MEM_K<<10);
	memory_end &= 0xfffff000;
	if (memory_end > 16*1024*1024)
		memory_end = 16*1024*1024;
	if (memory_end > 12*1024*1024)
		buffer_memory_end = 4*1024*1024;
	else if (memory_end > 6*1024*1024)
		buffer_memory_end = 2*1024*1024;
	else
		buffer_memory_end = 1*1024*1024;
	main_memory_start = buffer_memory_end;
#ifdef RAMDISK
	main_memory_start += rd_init(main_memory_start, RAMDISK*1024);
#endif
	mem_init(main_memory_start,memory_end);
	trap_init();
	blk_dev_init();
	chr_dev_init();
	tty_init();
	time_init();
	sched_init();
	buffer_init(buffer_memory_end);
	hd_init();
	floppy_init();
	sti();
	move_to_user_mode();
	if (!fork()) {		/* we count on this going ok */
		init();		/* child: init() is analysed in section 三.1 of the article */
	}
/*
 * NOTE!! For any other task 'pause()' would mean we have to get a
 * signal to awaken, but task0 is the sole exception (see 'schedule()')
 * as task 0 gets activated at every idle moment (when no other tasks
 * can run). For task0 'pause()' just means we go check if some other
 * task can run, and if not we return here.
 */
	for(;;) pause();
}
五
信号概述
内核的信号机制是很重要的(注意这里指的是信号signal,而不是信号量semaphore),关于信号的定义在/include/signal.h文件内,比如运行一个elf文件可能会出现段错误(SIGSEGV),玩pwn的同学应该很熟悉。在system_call.s中存在call do_signal,而do_signal在/kernel/signal.c内定义。
硬件来源:信号由硬件驱动产生
软件来源:系统提供了些API,例如kill命令
当进程收到信号时,会有三种处理方式:
忽略:忽略信号
执行:执行每个信号所对应的操作
执行自定操作:用户自定义的操作
① 在系统中什么是信号,都有什么信号?
② 在系统接收到信号后,是如何进行处理的?
③ 信号作用。
1、signal.h
typedef int sig_atomic_t;
typedef unsigned int sigset_t; /* 32 bits */
/* Ok, I haven't implemented sigactions, but trying to keep headers POSIX */
struct sigaction { /* per-signal handling description */
void (*sa_handler)(int); /* action to take for the signal; may also be SIG_DFL or SIG_IGN */
sigset_t sa_mask; /* signals to block while the handler runs */
int sa_flags; /* behaviour flags (SA_ONESHOT and SA_NOMASK are tested in signal.c) */
void (*sa_restorer)(void); /* pointer to the restorer function */
};
void (*signal(int _sig, void (*_func)(int)))(int);
int raise(int sig);
int kill(pid_t pid, int sig);
int sigaddset(sigset_t *mask, int signo);
int sigdelset(sigset_t *mask, int signo);
int sigemptyset(sigset_t *mask);
int sigfillset(sigset_t *mask);
int sigismember(sigset_t *mask, int signo); /* 1 - is, 0 - not, -1 error */
int sigpending(sigset_t *set);
int sigprocmask(int how, sigset_t *set, sigset_t *oldset);
int sigsuspend(sigset_t *sigmask);
int sigaction(int sig, struct sigaction *act, struct sigaction *oldact);
2、signal.c
volatile void do_exit(int error_code);
/* Return the current task's blocked-signal bitmap. */
int sys_sgetmask()
{
	int mask = current->blocked;

	return mask;
}
/*
 * Install a new blocked-signal bitmap for the current task and return the
 * previous one. The SIGKILL bit is always cleared from the new mask.
 */
int sys_ssetmask(int newmask)
{
	int previous = current->blocked;
	int allowed = ~(1<<(SIGKILL-1));

	current->blocked = newmask & allowed;
	return previous;
}
/*
 * Copy one struct sigaction, byte by byte, from kernel memory at 'from' to
 * the address 'to' written through put_fs_byte(). The destination range is
 * passed to verify_area() first.
 */
static inline void save_old(char * from,char * to)
{
	char *src = from;
	char *dst = to;
	char *src_end = from + sizeof(struct sigaction);

	verify_area(to, sizeof(struct sigaction));
	while (src != src_end) {
		put_fs_byte(*src,dst);
		src++;
		dst++;
	}
}
/*
 * Fill one struct sigaction at 'to' with bytes fetched through
 * get_fs_byte() from the address 'from'.
 */
static inline void get_new(char * from,char * to)
{
	char *dst_end = to + sizeof(struct sigaction);

	while (to != dst_end) {
		*to = get_fs_byte(from);
		to++;
		from++;
	}
}
int sys_signal(int signum, long handler, long restorer) //signum是信号标号,handlers是信号处理的函数指针,restorer是恢复函数指针,即执行完signal系统调用后,恢复堆栈及返回值
{
struct sigaction tmp;
if (signum<1 || signum>32 || signum==SIGKILL)
return -1;
tmp.sa_handler = (void (*)(int)) handler; //设置结构体
tmp.sa_mask = 0;
tmp.sa_flags = SA_ONESHOT | SA_NOMASK;
tmp.sa_restorer = (void (*)(void)) restorer;
handler = (long) current->sigaction[signum-1].sa_handler;
current->sigaction[signum-1] = tmp;
return handler;
}
int sys_sigaction(int signum, const struct sigaction * action,
struct sigaction * oldaction) //设置新信号处理结构体
{
struct sigaction tmp;
if (signum<1 || signum>32 || signum==SIGKILL) //若不符合信号值大小,直接返回
return -1;
tmp = current->sigaction[signum-1]; //信号值所对应的sigaction结构体
get_new((char *) action,
(char *) (signum-1+current->sigaction)); //设置新信号处理结构体
if (oldaction)
save_old((char *) &tmp,(char *) oldaction); //将old保存到tmp
if (current->sigaction[signum-1].sa_flags & SA_NOMASK) //如果允许处理信号过程中再次收到该信号,则屏蔽码置为0
current->sigaction[signum-1].sa_mask = 0;
else //否则,设置屏蔽本信号
current->sigaction[signum-1].sa_mask |= (1<<(signum-1));
return 0;
}
/*
 * Arrange for the handler of signal 'signr' to run when the task returns.
 *
 * signr is the signal number; the remaining parameters are the saved
 * register values. eip and esp are modified in place through *(&eip) and
 * *(&esp), so the changes are written to the parameter slots themselves.
 *
 * Frame written at the new esp (lowest address first):
 *   sa_restorer, signr, [blocked - only when SA_NOMASK is clear],
 *   eax, ecx, edx, eflags, old eip
 * which is 7 longs with SA_NOMASK and 8 without.
 */
void do_signal(long signr,long eax, long ebx, long ecx, long edx,
long fs, long es, long ds,
long eip, long cs, long eflags,
unsigned long * esp, long ss)
{
unsigned long sa_handler;
long old_eip=eip; /* remember the original return address */
struct sigaction * sa = current->sigaction + signr - 1; /* the current task's sigaction entry for signr */
int longs;
unsigned long * tmp_esp;
sa_handler = (unsigned long) sa->sa_handler; /* fetch the handler pointer */
if (sa_handler==1) /* handler value 1 (SIG_IGN per the article): ignore the signal */
return;
if (!sa_handler) { /* handler value 0 (SIG_DFL per the article): default handling */
if (signr==SIGCHLD) /* SIGCHLD: nothing to do */
return;
else
do_exit(1<<(signr-1)); /* every other signal terminates the task by default */
}
if (sa->sa_flags & SA_ONESHOT) /* one-shot handler: clear it after this use */
sa->sa_handler = NULL;
*(&eip) = sa_handler; /* replace the saved return address with the handler */
longs = (sa->sa_flags & SA_NOMASK)?7:8; /* frame size in longs; one extra slot when the mask is saved */
*(&esp) -= longs; /* make room for the frame below the saved esp */
verify_area(esp,longs*4);
tmp_esp=esp; /* walk upwards from the new esp */
put_fs_long((long) sa->sa_restorer,tmp_esp++); /* restorer function address */
put_fs_long(signr,tmp_esp++); /* signal number */
if (!(sa->sa_flags & SA_NOMASK))
put_fs_long(current->blocked,tmp_esp++); /* blocked mask as it was before this handler */
put_fs_long(eax,tmp_esp++); /* saved registers follow */
put_fs_long(ecx,tmp_esp++);
put_fs_long(edx,tmp_esp++);
put_fs_long(eflags,tmp_esp++);
put_fs_long(old_eip,tmp_esp++);
current->blocked |= sa->sa_mask; /* block the handler's mask while it runs */
}
3、sa_restorer
/* 如果没有屏蔽码,使用该函数作为恢复函数 */
/*
 * Restorer used when no blocked mask was saved in the signal frame.
 * Unwinds the frame in order: signr, eax, ecx, edx, eflags, then returns
 * through the remaining word on the stack.
 */
sig_restore:
addl $4,%esp /* discard signr */
popl %eax /* restore the saved eax (the system call return value) */
popl %ecx /* restore ecx and edx */
popl %edx
popfl /* restore eflags */
ret
/* 如果有屏蔽码,使用该函数 */
/*
 * Restorer used when a blocked mask was saved in the signal frame.
 * Same as the mask-less variant, except that the saved mask (left on top
 * of the stack after signr is dropped) is first passed to ssetmask.
 */
masksig_restore:
addl $4,%esp /* discard signr */
call ssetmask /* set the signal mask; the saved mask on the stack is the argument */
addl $4,%esp /* discard the mask */
popl %eax /* restore eax, ecx, edx */
popl %ecx
popl %edx
popfl /* restore eflags */
ret
六
文件系统
顾名思义就是文件所组成的一个系统,linux下所谓“一切皆文件”,所以文件系统在内核中占了很大比重。
Linux启动过程:
1、文件系统概述
文件系统主要包括四个部分:高速缓冲区管理,文件底层操作,文件数据访问,文件高层访问控制。
(1)文件系统底层函数
① bitmap.c
程序包括对i节点位图和逻辑块位图进行释放和占用处理函数。操作i节点位图的函数是free_inode()和new_inode(),操作逻辑块位图的函数是free_block()和new_block()。
② truncate.c
程序包括对数据文件长度截断为0的函数truncate(),他将i节点指定的设备上文件长度截为0,并释放文件数据占用的设备逻辑块。
③ inode.c
程序包括分配i节点函数iget()和放回对内存i节点存取函数iput()以及根据i节点信息取文件数据块在设备上对应的逻辑块号函数bmap()。
④ namei.c
程序主要包括函数namei(),该函数使用iget(),iput(),bmap()将给定的文件路径名映射到其i节点。
⑤ super.c
程序专门用于处理文件系统超级块,包括函数get_super()、put_super()和free_super()等,还包括几个文件系统加载/卸载处理函数和系统调用,如sys_mount()等。
(2)文件中数据的访问操作
① block_dev.c
程序中的函数block_read()和block_write()是用于读写块设备特殊文件的数据,所使用的参数指定要访问的设备号,起始地址和长度
② file_dev.c
程序中的file_read()和file_write()函数是用于访问一般的文件,所使用的参数指定文件对应的i节点和文件结构。
③ pipe.c
文件中实现了管道读写函数read_pipe()和write_pipe(),另外还实现了创建无名管道的系统调用pipe()。
④ char_dev.c
系统调用使用read()和write()会调用char_dev.c中的rw_char()函数来操作。字符设备包括控制台终端,串口终端和内存字符设备。
(3)文件和目录管理系统调用
① open.c
文件用于实现与文件操作相关的系统调用,主要有文件的创建,打开和关闭,文件宿主和属性修改,文件访问权限和操作时间的修改等。
② exec.c
程序实现对二进制可执行文件和shell脚本文件的加载与执行,其中最主要的函数是do_execve(),它是系统中断调用(int 0x80)的功能号__NR_execve()调用的C处理函数,更是exec()函数簇的主要实现函数。
③ fcntl.c
实现了文件控制操作的系统调用fcntl()和两个文件句柄(描述符)复制系统调用dup()和dup2(),dup2()指定了新句柄的数值,dup()则返回当前最小值的未用句柄。句柄复制操作主要用在文件的标准输入/输出重定向和管道操作方面。
④ ioctl.c
文件实现了输入/输出控制系统调用ioctl(),主要调用tty_ioctl()函数,对终端的I/O进行控制。
⑤ stat.c
文件用于实现取得文件状态信息的系统调用,stat()和fstat()。stat()是利用文件名取信息,而fstat()是利用文件句柄取信息。
2、高速缓冲区管理(buffer.c)
高速缓冲区位于内核代码与主内存区之间,在块设备与内核其他程序之间起着一个桥梁作用,除了块设备驱动程序以外,内核程序如果需要访问块设备中的数据,就需要通过高速缓冲区来进行操作。
看雪ID:e*16 a
https://bbs.kanxue.com/user-home-922338.htm
# 往期推荐
球分享
球点赞
球在看
点击“阅读原文”,了解更多!
原文始发于微信公众号(看雪学苑):Linux内核学习笔记